HexagonVectorCombine.cpp
1//===-- HexagonVectorCombine.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// HexagonVectorCombine is a utility class implementing a variety of functions
9// that assist in vector-based optimizations.
10//
11// AlignVectors: replace unaligned vector loads and stores with aligned ones.
12// HvxIdioms: recognize various opportunities to generate HVX intrinsic code.
13//===----------------------------------------------------------------------===//
14
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/ArrayRef.h"
17#include "llvm/ADT/DenseMap.h"
18#include "llvm/ADT/STLExtras.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/IRBuilder.h"
33#include "llvm/IR/Intrinsics.h"
34#include "llvm/IR/IntrinsicsHexagon.h"
35#include "llvm/IR/Metadata.h"
38#include "llvm/Pass.h"
45
46#include "Hexagon.h"
47#include "HexagonSubtarget.h"
49
50#include <algorithm>
51#include <deque>
52#include <map>
53#include <optional>
54#include <set>
55#include <utility>
56#include <vector>
57
58#define DEBUG_TYPE "hexagon-vc"
59
60// This is a constant that represents the default HVX VTCM page size.
61// It is boot-time configurable, so we probably want an API to
62// read it, but for now assume 128 KB (131072 bytes).
63#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072
64
65using namespace llvm;
66
67namespace {
68cl::opt<bool> DumpModule("hvc-dump-module", cl::Hidden);
69cl::opt<bool> VAEnabled("hvc-va", cl::Hidden, cl::init(true)); // Align
70cl::opt<bool> VIEnabled("hvc-vi", cl::Hidden, cl::init(true)); // Idioms
71cl::opt<bool> VADoFullStores("hvc-va-full-stores", cl::Hidden);
72
73cl::opt<unsigned> VAGroupCountLimit("hvc-va-group-count-limit", cl::Hidden,
74 cl::init(~0));
75cl::opt<unsigned> VAGroupSizeLimit("hvc-va-group-size-limit", cl::Hidden,
76 cl::init(~0));
77
78class HexagonVectorCombine {
79public:
80 HexagonVectorCombine(Function &F_, AliasAnalysis &AA_, AssumptionCache &AC_,
81 DominatorTree &DT_, ScalarEvolution &SE_,
82 TargetLibraryInfo &TLI_, const TargetMachine &TM_)
83 : F(F_), DL(F.getDataLayout()), AA(AA_), AC(AC_), DT(DT_),
84 SE(SE_), TLI(TLI_),
85 HST(static_cast<const HexagonSubtarget &>(*TM_.getSubtargetImpl(F))) {}
86
87 bool run();
88
89 // Common integer type.
90 IntegerType *getIntTy(unsigned Width = 32) const;
91 // Byte type: either scalar (when ElemCount = 0), or vector with given
92 // element count.
93 Type *getByteTy(int ElemCount = 0) const;
94 // Boolean type: either scalar (when ElemCount = 0), or vector with given
95 // element count.
96 Type *getBoolTy(int ElemCount = 0) const;
97 // Create a ConstantInt of type returned by getIntTy with the value Val.
98 ConstantInt *getConstInt(int Val, unsigned Width = 32) const;
99 // Get the integer value of V, if it exists.
100 std::optional<APInt> getIntValue(const Value *Val) const;
101 // Is Val a constant 0, or a vector of 0s?
102 bool isZero(const Value *Val) const;
103 // Is Val an undef value?
104 bool isUndef(const Value *Val) const;
105 // Is Val a scalar (i1 true) or a vector of (i1 true)?
106 bool isTrue(const Value *Val) const;
107 // Is Val a scalar (i1 false) or a vector of (i1 false)?
108 bool isFalse(const Value *Val) const;
109
110 // Get HVX vector type with the given element type.
111 VectorType *getHvxTy(Type *ElemTy, bool Pair = false) const;
112
113 enum SizeKind {
114 Store, // Store size
115 Alloc, // Alloc size
116 };
117 int getSizeOf(const Value *Val, SizeKind Kind = Store) const;
118 int getSizeOf(const Type *Ty, SizeKind Kind = Store) const;
119 int getTypeAlignment(Type *Ty) const;
120 size_t length(Value *Val) const;
121 size_t length(Type *Ty) const;
122
123 Constant *getNullValue(Type *Ty) const;
124 Constant *getFullValue(Type *Ty) const;
125 Constant *getConstSplat(Type *Ty, int Val) const;
126
127 Value *simplify(Value *Val) const;
128
129 Value *insertb(IRBuilderBase &Builder, Value *Dest, Value *Src, int Start,
130 int Length, int Where) const;
131 Value *vlalignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
132 Value *Amt) const;
133 Value *vralignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
134 Value *Amt) const;
135 Value *concat(IRBuilderBase &Builder, ArrayRef<Value *> Vecs) const;
136 Value *vresize(IRBuilderBase &Builder, Value *Val, int NewSize,
137 Value *Pad) const;
138 Value *rescale(IRBuilderBase &Builder, Value *Mask, Type *FromTy,
139 Type *ToTy) const;
140 Value *vlsb(IRBuilderBase &Builder, Value *Val) const;
141 Value *vbytes(IRBuilderBase &Builder, Value *Val) const;
142 Value *subvector(IRBuilderBase &Builder, Value *Val, unsigned Start,
143 unsigned Length) const;
144 Value *sublo(IRBuilderBase &Builder, Value *Val) const;
145 Value *subhi(IRBuilderBase &Builder, Value *Val) const;
146 Value *vdeal(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
147 Value *vshuff(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
148
149 Value *createHvxIntrinsic(IRBuilderBase &Builder, Intrinsic::ID IntID,
150 Type *RetTy, ArrayRef<Value *> Args,
151 ArrayRef<Type *> ArgTys = {},
152 ArrayRef<Value *> MDSources = {}) const;
153 SmallVector<Value *> splitVectorElements(IRBuilderBase &Builder, Value *Vec,
154 unsigned ToWidth) const;
155 Value *joinVectorElements(IRBuilderBase &Builder, ArrayRef<Value *> Values,
156 VectorType *ToType) const;
157
158 std::optional<int> calculatePointerDifference(Value *Ptr0, Value *Ptr1) const;
159
160 unsigned getNumSignificantBits(const Value *V,
161 const Instruction *CtxI = nullptr) const;
162 KnownBits getKnownBits(const Value *V,
163 const Instruction *CtxI = nullptr) const;
164
165 bool isSafeToClone(const Instruction &In) const;
166
167 template <typename T = std::vector<Instruction *>>
168 bool isSafeToMoveBeforeInBB(const Instruction &In,
169 BasicBlock::const_iterator To,
170 const T &IgnoreInsts = {}) const;
171
172 // This function is only used for assertions at the moment.
173 [[maybe_unused]] bool isByteVecTy(Type *Ty) const;
174
175 Function &F;
176 const DataLayout &DL;
177 AliasAnalysis &AA;
178 AssumptionCache &AC;
179 DominatorTree &DT;
180 ScalarEvolution &SE;
181 TargetLibraryInfo &TLI;
182 const HexagonSubtarget &HST;
183
184private:
185 Value *getElementRange(IRBuilderBase &Builder, Value *Lo, Value *Hi,
186 int Start, int Length) const;
187};
188
189class AlignVectors {
190 // This code tries to replace unaligned vector loads/stores with aligned
191 // ones.
192 // Consider unaligned load:
193 // %v = original_load %some_addr, align <bad>
194 // %user = %v
195 // It will generate
196 // = load ..., align <good>
197 // = load ..., align <good>
198 // = valign
199 // etc.
200 // %synthesize = combine/shuffle the loaded data so that it looks
201 // exactly like what "original_load" has loaded.
202 // %user = %synthesize
203 // Similarly for stores.
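  // Illustration (hypothetical IR, assuming a 128-byte HVX vector):
  //   %v = load <128 x i8>, ptr %p, align 1
  // conceptually becomes
  //   %lo = load <128 x i8>, ptr align_down(%p, 128), align 128
  //   %hi = load <128 x i8>, ptr align_down(%p, 128) + 128, align 128
  //   %v.new = valign of %lo and %hi by (%p mod 128), equal to the original %v
  // and every user of %v is rewritten to use %v.new.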
204public:
205 AlignVectors(const HexagonVectorCombine &HVC_) : HVC(HVC_) {}
206
207 bool run();
208
209private:
210 using InstList = std::vector<Instruction *>;
211 using InstMap = DenseMap<Instruction *, Instruction *>;
212
213 struct AddrInfo {
214 AddrInfo(const AddrInfo &) = default;
215 AddrInfo(const HexagonVectorCombine &HVC, Instruction *I, Value *A, Type *T,
216 Align H)
217 : Inst(I), Addr(A), ValTy(T), HaveAlign(H),
218 NeedAlign(HVC.getTypeAlignment(ValTy)) {}
219 AddrInfo &operator=(const AddrInfo &) = default;
220
221 // XXX: add Size member?
222 Instruction *Inst;
223 Value *Addr;
224 Type *ValTy;
225 Align HaveAlign;
226 Align NeedAlign;
227 int Offset = 0; // Offset (in bytes) from the first member of the
228 // containing AddrList.
229 };
230 using AddrList = std::vector<AddrInfo>;
231
232 struct InstrLess {
233 bool operator()(const Instruction *A, const Instruction *B) const {
234 return A->comesBefore(B);
235 }
236 };
237 using DepList = std::set<Instruction *, InstrLess>;
238
239 struct MoveGroup {
240 MoveGroup(const AddrInfo &AI, Instruction *B, bool Hvx, bool Load)
241 : Base(B), Main{AI.Inst}, Clones{}, IsHvx(Hvx), IsLoad(Load) {}
242 MoveGroup() = default;
243 Instruction *Base; // Base instruction of the parent address group.
244 InstList Main; // Main group of instructions.
245 InstList Deps; // List of dependencies.
246 InstMap Clones; // Map from original Deps to cloned ones.
247 bool IsHvx; // Is this a group of HVX instructions?
248 bool IsLoad; // Is this a load group?
249 };
250 using MoveList = std::vector<MoveGroup>;
251
252 struct ByteSpan {
253 // A representation of "interesting" bytes within a given span of memory.
254 // These bytes are those that are loaded or stored, and they don't have
255 // to cover the entire span of memory.
256 //
257 // The representation works by picking a contiguous sequence of bytes
258 // from somewhere within a llvm::Value, and placing it at a given offset
259 // within the span.
260 //
261 // The sequence of bytes from llvm::Value is represented by Segment.
262 // Block is Segment, plus where it goes in the span.
263 //
264 // An important feature of ByteSpan is being able to make a "section",
265 // i.e. creating another ByteSpan corresponding to a range of offsets
266 // relative to the source span.
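  //
  // Illustration (hypothetical contents): if Blocks holds
  //   {Val=V0, Start=0, Size=8, Pos=0} and {Val=V1, Start=0, Size=8, Pos=8},
  // then section(4, 8) produces a ByteSpan containing
  //   {Val=V0, Start=4, Size=4, Pos=4} and {Val=V1, Start=0, Size=4, Pos=8},
  // i.e. only the bytes of the span at offsets [4, 12), kept at their
  // original positions.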
267
268 struct Segment {
269 // Segment of a Value: 'Len' bytes starting at byte 'Begin'.
270 Segment(Value *Val, int Begin, int Len)
271 : Val(Val), Start(Begin), Size(Len) {}
272 Segment(const Segment &Seg) = default;
273 Segment &operator=(const Segment &Seg) = default;
274 Value *Val; // Value representable as a sequence of bytes.
275 int Start; // First byte of the value that belongs to the segment.
276 int Size; // Number of bytes in the segment.
277 };
278
279 struct Block {
280 Block(Value *Val, int Len, int Pos) : Seg(Val, 0, Len), Pos(Pos) {}
281 Block(Value *Val, int Off, int Len, int Pos)
282 : Seg(Val, Off, Len), Pos(Pos) {}
283 Block(const Block &Blk) = default;
284 Block &operator=(const Block &Blk) = default;
285 Segment Seg; // Value segment.
286 int Pos; // Position (offset) of the block in the span.
287 };
288
289 int extent() const;
290 ByteSpan section(int Start, int Length) const;
291 ByteSpan &shift(int Offset);
292 SmallVector<Value *, 8> values() const;
293
294 int size() const { return Blocks.size(); }
295 Block &operator[](int i) { return Blocks[i]; }
296 const Block &operator[](int i) const { return Blocks[i]; }
297
298 std::vector<Block> Blocks;
299
300 using iterator = decltype(Blocks)::iterator;
301 iterator begin() { return Blocks.begin(); }
302 iterator end() { return Blocks.end(); }
303 using const_iterator = decltype(Blocks)::const_iterator;
304 const_iterator begin() const { return Blocks.begin(); }
305 const_iterator end() const { return Blocks.end(); }
306 };
307
308 std::optional<AddrInfo> getAddrInfo(Instruction &In) const;
309 bool isHvx(const AddrInfo &AI) const;
310 // This function is only used for assertions at the moment.
311 [[maybe_unused]] bool isSectorTy(Type *Ty) const;
312
313 Value *getPayload(Value *Val) const;
314 Value *getMask(Value *Val) const;
315 Value *getPassThrough(Value *Val) const;
316
317 Value *createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
318 int Adjust,
319 const InstMap &CloneMap = InstMap()) const;
320 Value *createAlignedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
321 int Alignment,
322 const InstMap &CloneMap = InstMap()) const;
323
324 Value *createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
325 Value *Predicate, int Alignment, Value *Mask,
326 Value *PassThru, ArrayRef<Value *> MDSources = {}) const;
327 Value *createSimpleLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
328 int Alignment,
329 ArrayRef<Value *> MDSources = {}) const;
330
331 Value *createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
332 Value *Predicate, int Alignment, Value *Mask,
333 ArrayRef<Value *> MDSources = {}) const;
334 Value *createSimpleStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
335 int Alignment,
336 ArrayRef<Value *> MDSources = {}) const;
337
338 Value *createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
339 Value *Predicate, int Alignment,
340 ArrayRef<Value *> MDSources = {}) const;
341 Value *createPredicatedStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
342 Value *Predicate, int Alignment,
343 ArrayRef<Value *> MDSources = {}) const;
344
345 DepList getUpwardDeps(Instruction *In, Instruction *Base) const;
346 bool createAddressGroups();
347 MoveList createLoadGroups(const AddrList &Group) const;
348 MoveList createStoreGroups(const AddrList &Group) const;
349 bool moveTogether(MoveGroup &Move) const;
350 template <typename T>
351 InstMap cloneBefore(BasicBlock::iterator To, T &&Insts) const;
352
353 void realignLoadGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
354 int ScLen, Value *AlignVal, Value *AlignAddr) const;
355 void realignStoreGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
356 int ScLen, Value *AlignVal, Value *AlignAddr) const;
357 bool realignGroup(const MoveGroup &Move) const;
358
359 Value *makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
360 int Alignment) const;
361
362 friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
363 friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
364 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);
365 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);
366
367 std::map<Instruction *, AddrList> AddrGroups;
368 const HexagonVectorCombine &HVC;
369};
370
371[[maybe_unused]]
372raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::AddrInfo &AI) {
373 OS << "Inst: " << AI.Inst << " " << *AI.Inst << '\n';
374 OS << "Addr: " << *AI.Addr << '\n';
375 OS << "Type: " << *AI.ValTy << '\n';
376 OS << "HaveAlign: " << AI.HaveAlign.value() << '\n';
377 OS << "NeedAlign: " << AI.NeedAlign.value() << '\n';
378 OS << "Offset: " << AI.Offset;
379 return OS;
380}
381
382[[maybe_unused]]
383raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) {
384 OS << "IsLoad:" << (MG.IsLoad ? "yes" : "no");
385 OS << ", IsHvx:" << (MG.IsHvx ? "yes" : "no") << '\n';
386 OS << "Main\n";
387 for (Instruction *I : MG.Main)
388 OS << " " << *I << '\n';
389 OS << "Deps\n";
390 for (Instruction *I : MG.Deps)
391 OS << " " << *I << '\n';
392 OS << "Clones\n";
393 for (auto [K, V] : MG.Clones) {
394 OS << " ";
395 K->printAsOperand(OS, false);
396 OS << "\t-> " << *V << '\n';
397 }
398 return OS;
399}
400
401[[maybe_unused]]
402raw_ostream &operator<<(raw_ostream &OS,
403 const AlignVectors::ByteSpan::Block &B) {
404 OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] ";
405 if (B.Seg.Val == reinterpret_cast<const Value *>(&B)) {
406 OS << "(self:" << B.Seg.Val << ')';
407 } else if (B.Seg.Val != nullptr) {
408 OS << *B.Seg.Val;
409 } else {
410 OS << "(null)";
411 }
412 return OS;
413}
414
415[[maybe_unused]]
416raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) {
417 OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';
418 for (const AlignVectors::ByteSpan::Block &B : BS)
419 OS << B << '\n';
420 OS << ']';
421 return OS;
422}
423
424class HvxIdioms {
425public:
426 enum DstQualifier {
427 Undefined = 0,
428 Arithmetic,
429 LdSt,
430 LLVM_Gather,
431 LLVM_Scatter,
432 HEX_Gather_Scatter,
433 HEX_Gather,
434 HEX_Scatter,
435 Call
436 };
437
438 HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {
439 auto *Int32Ty = HVC.getIntTy(32);
440 HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false);
441 HvxP32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/true);
442 }
443
444 bool run();
445
446private:
447 enum Signedness { Positive, Signed, Unsigned };
448
449 // Value + sign
450 // This is to keep track of whether the value should be treated as signed
451 // or unsigned, or is known to be positive.
452 struct SValue {
453 Value *Val;
454 Signedness Sgn;
455 };
456
457 struct FxpOp {
458 unsigned Opcode;
459 unsigned Frac; // Number of fraction bits
460 SValue X, Y;
461 // If present, add 1 << RoundAt before shift:
462 std::optional<unsigned> RoundAt;
463 VectorType *ResTy;
464 };
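  // Illustration (hypothetical operands): a rounding Q15 multiply would be
  // represented as Opcode = Mul, Frac = 15, RoundAt = 14, i.e. the matched
  // expression computes (X * Y + (1 << 14)) >> 15.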
465
466 auto getNumSignificantBits(Value *V, Instruction *In) const
467 -> std::pair<unsigned, Signedness>;
468 auto canonSgn(SValue X, SValue Y) const -> std::pair<SValue, SValue>;
469
470 auto matchFxpMul(Instruction &In) const -> std::optional<FxpOp>;
471 auto processFxpMul(Instruction &In, const FxpOp &Op) const -> Value *;
472
473 auto processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
474 const FxpOp &Op) const -> Value *;
475 auto createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
476 bool Rounding) const -> Value *;
477 auto createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
478 bool Rounding) const -> Value *;
479 // Return {Result, Carry}, where Carry is a vector predicate.
480 auto createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
481 Value *CarryIn = nullptr) const
482 -> std::pair<Value *, Value *>;
483 auto createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const -> Value *;
484 auto createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
485 -> Value *;
486 auto createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
487 -> std::pair<Value *, Value *>;
488 auto createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
489 ArrayRef<Value *> WordY) const -> SmallVector<Value *>;
490 auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
491 Signedness SgnX, ArrayRef<Value *> WordY,
492 Signedness SgnY) const -> SmallVector<Value *>;
493 // Vector manipulations for Ripple
494 bool matchScatter(Instruction &In) const;
495 bool matchGather(Instruction &In) const;
496 Value *processVScatter(Instruction &In) const;
497 Value *processVGather(Instruction &In) const;
498
499 VectorType *HvxI32Ty;
500 VectorType *HvxP32Ty;
501 const HexagonVectorCombine &HVC;
502
503 friend raw_ostream &operator<<(raw_ostream &, const FxpOp &);
504};
505
506[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
507 const HvxIdioms::FxpOp &Op) {
508 static const char *SgnNames[] = {"Positive", "Signed", "Unsigned"};
509 OS << Instruction::getOpcodeName(Op.Opcode) << '.' << Op.Frac;
510 if (Op.RoundAt.has_value()) {
511 if (Op.Frac != 0 && *Op.RoundAt == Op.Frac - 1) {
512 OS << ":rnd";
513 } else {
514 OS << " + 1<<" << *Op.RoundAt;
515 }
516 }
517 OS << "\n X:(" << SgnNames[Op.X.Sgn] << ") " << *Op.X.Val << "\n"
518 << " Y:(" << SgnNames[Op.Y.Sgn] << ") " << *Op.Y.Val;
519 return OS;
520}
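// For the Q15 example above this prints "mul.15:rnd", followed by the X and Y
// operands together with their signedness.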
521
522} // namespace
523
524namespace {
525
526template <typename T> T *getIfUnordered(T *MaybeT) {
527 return MaybeT && MaybeT->isUnordered() ? MaybeT : nullptr;
528}
529template <typename T> T *isCandidate(Instruction *In) {
530 return dyn_cast<T>(In);
531}
532template <> LoadInst *isCandidate<LoadInst>(Instruction *In) {
533 return getIfUnordered(dyn_cast<LoadInst>(In));
534}
535template <> StoreInst *isCandidate<StoreInst>(Instruction *In) {
536 return getIfUnordered(dyn_cast<StoreInst>(In));
537}
538
539#if !defined(_MSC_VER) || _MSC_VER >= 1926
540// VS2017 and some versions of VS2019 have trouble compiling this:
541// error C2976: 'std::map': too few template arguments
542// VS 2019 16.x is known to work, except for 16.4/16.5 (MSC_VER 1924/1925)
543template <typename Pred, typename... Ts>
544void erase_if(std::map<Ts...> &map, Pred p)
545#else
546template <typename Pred, typename T, typename U>
547void erase_if(std::map<T, U> &map, Pred p)
548#endif
549{
550 for (auto i = map.begin(), e = map.end(); i != e;) {
551 if (p(*i))
552 i = map.erase(i);
553 else
554 i = std::next(i);
555 }
556}
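// Used later in this file, for example:
//   erase_if(AddrGroups, [](auto &G) { return G.second.size() == 1; });
// where AddrGroups is a std::map.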
557
558// Forward other erase_ifs to the LLVM implementations.
559template <typename Pred, typename T> void erase_if(T &&container, Pred p) {
560 llvm::erase_if(std::forward<T>(container), p);
561}
562
563} // namespace
564
565// --- Begin AlignVectors
566
567// For brevity, only consider loads. We identify a group of loads where we
568// know the relative differences between their addresses, so we know how they
569// are laid out in memory (relative to one another). These loads can overlap,
570// can be shorter or longer than the desired vector length.
571// Ultimately we want to generate a sequence of aligned loads that will load
572// every byte that the original loads loaded, and have the program use these
573// loaded values instead of the original loads.
574// We consider the contiguous memory area spanned by all these loads.
575//
576// Let's say that a single aligned vector load can load 16 bytes at a time.
577// If the program wanted to use a byte at offset 13 from the beginning of the
578// original span, it will be a byte at offset 13+x in the aligned data for
579// some x>=0. This may happen to be in the first aligned load, or in the load
580// following it. Since we generally don't know what that alignment value
581// is at compile time, we proactively do valigns on the aligned loads, so that
582// the byte that was at offset 13 is still at offset 13 after the valigns.
583//
584// This will be the starting point for making the rest of the program use the
585// data loaded by the new loads.
586// For each original load, and its users:
587// %v = load ...
588// ... = %v
589// ... = %v
590// we create
591// %new_v = extract/combine/shuffle data from loaded/valigned vectors so
592// it contains the same value as %v did before
593// then replace all users of %v with %new_v.
594// ... = %new_v
595// ... = %new_v
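//
// Worked example (hypothetical addresses, 16-byte sectors): if the original
// span starts at Base+3 and (Base+3) % 16 == A, the aligned loads start at
// align_down(Base+3), so the byte the program wanted at span offset 13 sits
// at offset 13+A in the raw aligned data. A valign by A shifts it back to
// offset 13, so the rest of the rewrite can keep using the original offsets.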
596
597auto AlignVectors::ByteSpan::extent() const -> int {
598 if (size() == 0)
599 return 0;
600 int Min = Blocks[0].Pos;
601 int Max = Blocks[0].Pos + Blocks[0].Seg.Size;
602 for (int i = 1, e = size(); i != e; ++i) {
603 Min = std::min(Min, Blocks[i].Pos);
604 Max = std::max(Max, Blocks[i].Pos + Blocks[i].Seg.Size);
605 }
606 return Max - Min;
607}
608
609auto AlignVectors::ByteSpan::section(int Start, int Length) const -> ByteSpan {
610 ByteSpan Section;
611 for (const ByteSpan::Block &B : Blocks) {
612 int L = std::max(B.Pos, Start); // Left end.
613 int R = std::min(B.Pos + B.Seg.Size, Start + Length); // Right end+1.
614 if (L < R) {
615 // How much to chop off the beginning of the segment:
616 int Off = L > B.Pos ? L - B.Pos : 0;
617 Section.Blocks.emplace_back(B.Seg.Val, B.Seg.Start + Off, R - L, L);
618 }
619 }
620 return Section;
621}
622
623auto AlignVectors::ByteSpan::shift(int Offset) -> ByteSpan & {
624 for (Block &B : Blocks)
625 B.Pos += Offset;
626 return *this;
627}
628
629auto AlignVectors::ByteSpan::values() const -> SmallVector<Value *, 8> {
630 SmallVector<Value *, 8> Values(Blocks.size());
631 for (int i = 0, e = Blocks.size(); i != e; ++i)
632 Values[i] = Blocks[i].Seg.Val;
633 return Values;
634}
635
636auto AlignVectors::getAddrInfo(Instruction &In) const
637 -> std::optional<AddrInfo> {
638 if (auto *L = isCandidate<LoadInst>(&In))
639 return AddrInfo(HVC, L, L->getPointerOperand(), L->getType(),
640 L->getAlign());
641 if (auto *S = isCandidate<StoreInst>(&In))
642 return AddrInfo(HVC, S, S->getPointerOperand(),
643 S->getValueOperand()->getType(), S->getAlign());
644 if (auto *II = isCandidate<IntrinsicInst>(&In)) {
645 Intrinsic::ID ID = II->getIntrinsicID();
646 switch (ID) {
647 case Intrinsic::masked_load:
648 return AddrInfo(HVC, II, II->getArgOperand(0), II->getType(),
649 II->getParamAlign(0).valueOrOne());
650 case Intrinsic::masked_store:
651 return AddrInfo(HVC, II, II->getArgOperand(1),
652 II->getArgOperand(0)->getType(),
653 II->getParamAlign(1).valueOrOne());
654 }
655 }
656 return std::nullopt;
657}
658
659auto AlignVectors::isHvx(const AddrInfo &AI) const -> bool {
660 return HVC.HST.isTypeForHVX(AI.ValTy);
661}
662
663auto AlignVectors::getPayload(Value *Val) const -> Value * {
664 if (auto *In = dyn_cast<Instruction>(Val)) {
665 Intrinsic::ID ID = 0;
666 if (auto *II = dyn_cast<IntrinsicInst>(In))
667 ID = II->getIntrinsicID();
668 if (isa<StoreInst>(In) || ID == Intrinsic::masked_store)
669 return In->getOperand(0);
670 }
671 return Val;
672}
673
674auto AlignVectors::getMask(Value *Val) const -> Value * {
675 if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
676 switch (II->getIntrinsicID()) {
677 case Intrinsic::masked_load:
678 return II->getArgOperand(1);
679 case Intrinsic::masked_store:
680 return II->getArgOperand(2);
681 }
682 }
683
684 Type *ValTy = getPayload(Val)->getType();
685 if (auto *VecTy = dyn_cast<VectorType>(ValTy))
686 return HVC.getFullValue(HVC.getBoolTy(HVC.length(VecTy)));
687 return HVC.getFullValue(HVC.getBoolTy());
688}
689
690auto AlignVectors::getPassThrough(Value *Val) const -> Value * {
691 if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
692 if (II->getIntrinsicID() == Intrinsic::masked_load)
693 return II->getArgOperand(2);
694 }
695 return UndefValue::get(getPayload(Val)->getType());
696}
697
698auto AlignVectors::createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr,
699 Type *ValTy, int Adjust,
700 const InstMap &CloneMap) const
701 -> Value * {
702 if (auto *I = dyn_cast<Instruction>(Ptr))
703 if (Instruction *New = CloneMap.lookup(I))
704 Ptr = New;
705 return Builder.CreatePtrAdd(Ptr, HVC.getConstInt(Adjust), "gep");
706}
707
708auto AlignVectors::createAlignedPointer(IRBuilderBase &Builder, Value *Ptr,
709 Type *ValTy, int Alignment,
710 const InstMap &CloneMap) const
711 -> Value * {
712 auto remap = [&](Value *V) -> Value * {
713 if (auto *I = dyn_cast<Instruction>(V)) {
714 for (auto [Old, New] : CloneMap)
715 I->replaceUsesOfWith(Old, New);
716 return I;
717 }
718 return V;
719 };
720 Value *AsInt = Builder.CreatePtrToInt(Ptr, HVC.getIntTy(), "pti");
721 Value *Mask = HVC.getConstInt(-Alignment);
722 Value *And = Builder.CreateAnd(remap(AsInt), Mask, "and");
723 return Builder.CreateIntToPtr(
724 And, PointerType::getUnqual(ValTy->getContext()), "itp");
725}
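// Example: with Alignment = 128, the function above computes (roughly)
// inttoptr(ptrtoint(Ptr) & -128), i.e. Ptr rounded down to the nearest
// 128-byte boundary.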
726
727auto AlignVectors::createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
728 Value *Predicate, int Alignment, Value *Mask,
729 Value *PassThru,
730 ArrayRef<Value *> MDSources) const -> Value * {
731 bool HvxHasPredLoad = HVC.HST.useHVXV62Ops();
732 // Predicate is nullptr if we are not creating a predicated load.
733 if (Predicate) {
734 assert(!Predicate->getType()->isVectorTy() &&
735 "Expecting scalar predicate");
736 if (HVC.isFalse(Predicate))
737 return UndefValue::get(ValTy);
738 if (!HVC.isTrue(Predicate) && HvxHasPredLoad) {
739 Value *Load = createPredicatedLoad(Builder, ValTy, Ptr, Predicate,
740 Alignment, MDSources);
741 return Builder.CreateSelect(Mask, Load, PassThru);
742 }
743 // Predicate == true here.
744 }
745 assert(!HVC.isUndef(Mask)); // Should this be allowed?
746 if (HVC.isZero(Mask))
747 return PassThru;
748 if (HVC.isTrue(Mask))
749 return createSimpleLoad(Builder, ValTy, Ptr, Alignment, MDSources);
750
751 Instruction *Load = Builder.CreateMaskedLoad(ValTy, Ptr, Align(Alignment),
752 Mask, PassThru, "mld");
753 propagateMetadata(Load, MDSources);
754 return Load;
755}
756
757auto AlignVectors::createSimpleLoad(IRBuilderBase &Builder, Type *ValTy,
758 Value *Ptr, int Alignment,
759 ArrayRef<Value *> MDSources) const
760 -> Value * {
761 Instruction *Load =
762 Builder.CreateAlignedLoad(ValTy, Ptr, Align(Alignment), "ald");
763 propagateMetadata(Load, MDSources);
764 return Load;
765}
766
767auto AlignVectors::createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy,
768 Value *Ptr, Value *Predicate,
769 int Alignment,
770 ArrayRef<Value *> MDSources) const
771 -> Value * {
772 assert(HVC.HST.isTypeForHVX(ValTy) &&
773 "Predicated 'scalar' vector loads not yet supported");
774 assert(Predicate);
775 assert(!Predicate->getType()->isVectorTy() && "Expecting scalar predicate");
776 assert(HVC.getSizeOf(ValTy, HVC.Alloc) % Alignment == 0);
777 if (HVC.isFalse(Predicate))
778 return UndefValue::get(ValTy);
779 if (HVC.isTrue(Predicate))
780 return createSimpleLoad(Builder, ValTy, Ptr, Alignment, MDSources);
781
782 auto V6_vL32b_pred_ai = HVC.HST.getIntrinsicId(Hexagon::V6_vL32b_pred_ai);
783 // FIXME: This may not put the offset from Ptr into the vmem offset.
784 return HVC.createHvxIntrinsic(Builder, V6_vL32b_pred_ai, ValTy,
785 {Predicate, Ptr, HVC.getConstInt(0)}, {},
786 MDSources);
787}
788
789auto AlignVectors::createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
790 Value *Predicate, int Alignment, Value *Mask,
791 ArrayRef<Value *> MDSources) const -> Value * {
792 if (HVC.isZero(Mask) || HVC.isUndef(Val) || HVC.isUndef(Mask))
793 return UndefValue::get(Val->getType());
794 assert(!Predicate || (!Predicate->getType()->isVectorTy() &&
795 "Expecting scalar predicate"));
796 if (Predicate) {
797 if (HVC.isFalse(Predicate))
798 return UndefValue::get(Val->getType());
799 if (HVC.isTrue(Predicate))
800 Predicate = nullptr;
801 }
802 // Here both Predicate and Mask are true or unknown.
803
804 if (HVC.isTrue(Mask)) {
805 if (Predicate) { // Predicate unknown
806 return createPredicatedStore(Builder, Val, Ptr, Predicate, Alignment,
807 MDSources);
808 }
809 // Predicate is true:
810 return createSimpleStore(Builder, Val, Ptr, Alignment, MDSources);
811 }
812
813 // Mask is unknown
814 if (!Predicate) {
815 Instruction *Store =
816 Builder.CreateMaskedStore(Val, Ptr, Align(Alignment), Mask);
817 propagateMetadata(Store, MDSources);
818 return Store;
819 }
820
821 // Both Predicate and Mask are unknown.
822 // Emulate masked store with predicated-load + mux + predicated-store.
823 Value *PredLoad = createPredicatedLoad(Builder, Val->getType(), Ptr,
824 Predicate, Alignment, MDSources);
825 Value *Mux = Builder.CreateSelect(Mask, Val, PredLoad);
826 return createPredicatedStore(Builder, Mux, Ptr, Predicate, Alignment,
827 MDSources);
828}
829
830auto AlignVectors::createSimpleStore(IRBuilderBase &Builder, Value *Val,
831 Value *Ptr, int Alignment,
832 ArrayRef<Value *> MDSources) const
833 -> Value * {
834 Instruction *Store = Builder.CreateAlignedStore(Val, Ptr, Align(Alignment));
835 propagateMetadata(Store, MDSources);
836 return Store;
837}
838
839auto AlignVectors::createPredicatedStore(IRBuilderBase &Builder, Value *Val,
840 Value *Ptr, Value *Predicate,
841 int Alignment,
842 ArrayRef<Value *> MDSources) const
843 -> Value * {
844 assert(HVC.HST.isTypeForHVX(Val->getType()) &&
845 "Predicated 'scalar' vector stores not yet supported");
846 assert(Predicate);
847 if (HVC.isFalse(Predicate))
848 return UndefValue::get(Val->getType());
849 if (HVC.isTrue(Predicate))
850 return createSimpleStore(Builder, Val, Ptr, Alignment, MDSources);
851
852 assert(HVC.getSizeOf(Val, HVC.Alloc) % Alignment == 0);
853 auto V6_vS32b_pred_ai = HVC.HST.getIntrinsicId(Hexagon::V6_vS32b_pred_ai);
854 // FIXME: This may not put the offset from Ptr into the vmem offset.
855 return HVC.createHvxIntrinsic(Builder, V6_vS32b_pred_ai, nullptr,
856 {Predicate, Ptr, HVC.getConstInt(0), Val}, {},
857 MDSources);
858}
859
860auto AlignVectors::getUpwardDeps(Instruction *In, Instruction *Base) const
861 -> DepList {
862 BasicBlock *Parent = Base->getParent();
863 assert(In->getParent() == Parent &&
864 "Base and In should be in the same block");
865 assert(Base->comesBefore(In) && "Base should come before In");
866
867 DepList Deps;
868 std::deque<Instruction *> WorkQ = {In};
869 while (!WorkQ.empty()) {
870 Instruction *D = WorkQ.front();
871 WorkQ.pop_front();
872 if (D != In)
873 Deps.insert(D);
874 for (Value *Op : D->operands()) {
875 if (auto *I = dyn_cast<Instruction>(Op)) {
876 if (I->getParent() == Parent && Base->comesBefore(I))
877 WorkQ.push_back(I);
878 }
879 }
880 }
881 return Deps;
882}
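// Illustration (hypothetical IR, all in one block; %x and %q are arguments):
//   %base = ...                          ; Base
//   %off = add i32 %x, 32
//   %p = getelementptr i8, ptr %q, i32 %off
//   %v = load <32 x i32>, ptr %p         ; In
// getUpwardDeps(%v, %base) returns {%off, %p}: the operands of In, collected
// transitively, that are instructions of the same block defined after Base.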
883
884auto AlignVectors::createAddressGroups() -> bool {
885 // An address group created here may contain instructions spanning
886 // multiple basic blocks.
887 AddrList WorkStack;
888
889 auto findBaseAndOffset = [&](AddrInfo &AI) -> std::pair<Instruction *, int> {
890 for (AddrInfo &W : WorkStack) {
891 if (auto D = HVC.calculatePointerDifference(AI.Addr, W.Addr))
892 return std::make_pair(W.Inst, *D);
893 }
894 return std::make_pair(nullptr, 0);
895 };
896
897 auto traverseBlock = [&](DomTreeNode *DomN, auto Visit) -> void {
898 BasicBlock &Block = *DomN->getBlock();
899 for (Instruction &I : Block) {
900 auto AI = this->getAddrInfo(I); // Use this-> for gcc6.
901 if (!AI)
902 continue;
903 auto F = findBaseAndOffset(*AI);
904 Instruction *GroupInst;
905 if (Instruction *BI = F.first) {
906 AI->Offset = F.second;
907 GroupInst = BI;
908 } else {
909 WorkStack.push_back(*AI);
910 GroupInst = AI->Inst;
911 }
912 AddrGroups[GroupInst].push_back(*AI);
913 }
914
915 for (DomTreeNode *C : DomN->children())
916 Visit(C, Visit);
917
918 while (!WorkStack.empty() && WorkStack.back().Inst->getParent() == &Block)
919 WorkStack.pop_back();
920 };
921
922 traverseBlock(HVC.DT.getRootNode(), traverseBlock);
923 assert(WorkStack.empty());
924
925 // AddrGroups are formed.
926
927 // Remove groups of size 1.
928 erase_if(AddrGroups, [](auto &G) { return G.second.size() == 1; });
929 // Remove groups that don't use HVX types.
930 erase_if(AddrGroups, [&](auto &G) {
931 return llvm::none_of(
932 G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(I.ValTy); });
933 });
934
935 return !AddrGroups.empty();
936}
937
938auto AlignVectors::createLoadGroups(const AddrList &Group) const -> MoveList {
939 // Form load groups.
940 // To avoid complications with moving code across basic blocks, only form
941 // groups that are contained within a single basic block.
942 unsigned SizeLimit = VAGroupSizeLimit;
943 if (SizeLimit == 0)
944 return {};
945
946 auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
947 assert(!Move.Main.empty() && "Move group should have non-empty Main");
948 if (Move.Main.size() >= SizeLimit)
949 return false;
950 // Don't mix HVX and non-HVX instructions.
951 if (Move.IsHvx != isHvx(Info))
952 return false;
953 // Leading instruction in the load group.
954 Instruction *Base = Move.Main.front();
955 if (Base->getParent() != Info.Inst->getParent())
956 return false;
957 // Check if it's safe to move the load.
958 if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator()))
959 return false;
960 // And if it's safe to clone the dependencies.
961 auto isSafeToCopyAtBase = [&](const Instruction *I) {
962 return HVC.isSafeToMoveBeforeInBB(*I, Base->getIterator()) &&
963 HVC.isSafeToClone(*I);
964 };
965 DepList Deps = getUpwardDeps(Info.Inst, Base);
966 if (!llvm::all_of(Deps, isSafeToCopyAtBase))
967 return false;
968
969 Move.Main.push_back(Info.Inst);
970 llvm::append_range(Move.Deps, Deps);
971 return true;
972 };
973
974 MoveList LoadGroups;
975
976 for (const AddrInfo &Info : Group) {
977 if (!Info.Inst->mayReadFromMemory())
978 continue;
979 if (LoadGroups.empty() || !tryAddTo(Info, LoadGroups.back()))
980 LoadGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), true);
981 }
982
983 // Erase singleton groups.
984 erase_if(LoadGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
985
986 // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
987 if (!HVC.HST.useHVXV62Ops())
988 erase_if(LoadGroups, [](const MoveGroup &G) { return G.IsHvx; });
989
990 return LoadGroups;
991}
992
993auto AlignVectors::createStoreGroups(const AddrList &Group) const -> MoveList {
994 // Form store groups.
995 // To avoid complications with moving code across basic blocks, only form
996 // groups that are contained within a single basic block.
997 unsigned SizeLimit = VAGroupSizeLimit;
998 if (SizeLimit == 0)
999 return {};
1000
1001 auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
1002 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1003 if (Move.Main.size() >= SizeLimit)
1004 return false;
1005 // For stores with return values we'd have to collect downward dependencies.
1006 // There are no such stores that we handle at the moment, so omit that.
1007 assert(Info.Inst->getType()->isVoidTy() &&
1008 "Not handling stores with return values");
1009 // Don't mix HVX and non-HVX instructions.
1010 if (Move.IsHvx != isHvx(Info))
1011 return false;
1012 // For stores we need to be careful whether it's safe to move them.
1013 // Stores that are otherwise safe to move together may not appear safe
1014 // to move over one another (i.e. isSafeToMoveBefore may return false).
1015 Instruction *Base = Move.Main.front();
1016 if (Base->getParent() != Info.Inst->getParent())
1017 return false;
1018 if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator(), Move.Main))
1019 return false;
1020 Move.Main.push_back(Info.Inst);
1021 return true;
1022 };
1023
1024 MoveList StoreGroups;
1025
1026 for (auto I = Group.rbegin(), E = Group.rend(); I != E; ++I) {
1027 const AddrInfo &Info = *I;
1028 if (!Info.Inst->mayWriteToMemory())
1029 continue;
1030 if (StoreGroups.empty() || !tryAddTo(Info, StoreGroups.back()))
1031 StoreGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), false);
1032 }
1033
1034 // Erase singleton groups.
1035 erase_if(StoreGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
1036
1037 // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
1038 if (!HVC.HST.useHVXV62Ops())
1039 erase_if(StoreGroups, [](const MoveGroup &G) { return G.IsHvx; });
1040
1041 // Erase groups where every store is a full HVX vector. The reason is that
1042 // aligning predicated stores generates complex code that may be less
1043 // efficient than a sequence of unaligned vector stores.
1044 if (!VADoFullStores) {
1045 erase_if(StoreGroups, [this](const MoveGroup &G) {
1046 return G.IsHvx && llvm::all_of(G.Main, [this](Instruction *S) {
1047 auto MaybeInfo = this->getAddrInfo(*S);
1048 assert(MaybeInfo.has_value());
1049 return HVC.HST.isHVXVectorType(
1050 EVT::getEVT(MaybeInfo->ValTy, false));
1051 });
1052 });
1053 }
1054
1055 return StoreGroups;
1056}
1057
1058auto AlignVectors::moveTogether(MoveGroup &Move) const -> bool {
1059 // Move all instructions to be adjacent.
1060 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1061 Instruction *Where = Move.Main.front();
1062
1063 if (Move.IsLoad) {
1064 // Move all the loads (and dependencies) to where the first load is.
1065 // Clone all deps to before Where, keeping order.
1066 Move.Clones = cloneBefore(Where->getIterator(), Move.Deps);
1067 // Move all main instructions to after Where, keeping order.
1068 ArrayRef<Instruction *> Main(Move.Main);
1069 for (Instruction *M : Main) {
1070 if (M != Where)
1071 M->moveAfter(Where);
1072 for (auto [Old, New] : Move.Clones)
1073 M->replaceUsesOfWith(Old, New);
1074 Where = M;
1075 }
1076 // Replace Deps with the clones.
1077 for (int i = 0, e = Move.Deps.size(); i != e; ++i)
1078 Move.Deps[i] = Move.Clones[Move.Deps[i]];
1079 } else {
1080 // Move all the stores to where the last store is.
1081 // NOTE: Deps are empty for "store" groups. If they need to be
1082 // non-empty, decide on the order.
1083 assert(Move.Deps.empty());
1084 // Move all main instructions to before Where, inverting order.
1085 ArrayRef<Instruction *> Main(Move.Main);
1086 for (Instruction *M : Main.drop_front(1)) {
1087 M->moveBefore(Where->getIterator());
1088 Where = M;
1089 }
1090 }
1091
1092 return Move.Main.size() + Move.Deps.size() > 1;
1093}
1094
1095template <typename T>
1096auto AlignVectors::cloneBefore(BasicBlock::iterator To, T &&Insts) const
1097 -> InstMap {
1098 InstMap Map;
1099
1100 for (Instruction *I : Insts) {
1101 assert(HVC.isSafeToClone(*I));
1102 Instruction *C = I->clone();
1103 C->setName(Twine("c.") + I->getName() + ".");
1104 C->insertBefore(To);
1105
1106 for (auto [Old, New] : Map)
1107 C->replaceUsesOfWith(Old, New);
1108 Map.insert(std::make_pair(I, C));
1109 }
1110 return Map;
1111}
1112
1113auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
1114 const ByteSpan &VSpan, int ScLen,
1115 Value *AlignVal, Value *AlignAddr) const
1116 -> void {
1117 LLVM_DEBUG(dbgs() << __func__ << "\n");
1118
1119 Type *SecTy = HVC.getByteTy(ScLen);
1120 int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1121 bool DoAlign = !HVC.isZero(AlignVal);
1122 BasicBlock::iterator BasePos = Builder.GetInsertPoint();
1123 BasicBlock *BaseBlock = Builder.GetInsertBlock();
1124
1125 ByteSpan ASpan;
1126 auto *True = HVC.getFullValue(HVC.getBoolTy(ScLen));
1127 auto *Undef = UndefValue::get(SecTy);
1128
1129 // The created load does not have to be an Instruction (e.g. it can be "undef").
1130 SmallVector<Value *> Loads(NumSectors + DoAlign, nullptr);
1131
1132 // We could create all of the aligned loads, and generate the valigns
1133 // at the location of the first load, but for large load groups, this
1134 // could create highly suboptimal code (there have been groups of 140+
1135 // loads in real code).
1136 // Instead, place the loads/valigns as close to the users as possible.
1137 // In any case we need to have a mapping from the blocks of VSpan (the
1138 // span covered by the pre-existing loads) to ASpan (the span covered
1139 // by the aligned loads). There is a small problem, though: ASpan needs
1140 // to have pointers to the loads/valigns, but we don't have these loads
1141 // because we don't know where to put them yet. We find out by creating
1142 // a section of ASpan that corresponds to values (blocks) from VSpan,
1143 // and checking where the new load should be placed. We need to attach
1144 // this location information to each block in ASpan somehow, so we put
1145 // distinct values for Seg.Val in each ASpan.Blocks[i], and use a map
1146 // to store the location for each Seg.Val.
1147 // The distinct values happen to be Blocks[i].Seg.Val = &Blocks[i],
1148 // which helps with printing ByteSpans without crashing when printing
1149 // Segments with these temporary identifiers in place of Val.
1150
1151 // Populate the blocks first, to avoid reallocations of the vector
1152 // interfering with generating the placeholder addresses.
1153 for (int Index = 0; Index != NumSectors; ++Index)
1154 ASpan.Blocks.emplace_back(nullptr, ScLen, Index * ScLen);
1155 for (int Index = 0; Index != NumSectors; ++Index) {
1156 ASpan.Blocks[Index].Seg.Val =
1157 reinterpret_cast<Value *>(&ASpan.Blocks[Index]);
1158 }
1159
1160 // Multiple values from VSpan can map to the same value in ASpan. Since we
1161 // try to create loads lazily, we need to find the earliest use for each
1162 // value from ASpan.
1163 DenseMap<void *, Instruction *> EarliestUser;
1164 auto isEarlier = [](Instruction *A, Instruction *B) {
1165 if (B == nullptr)
1166 return true;
1167 if (A == nullptr)
1168 return false;
1169 assert(A->getParent() == B->getParent());
1170 return A->comesBefore(B);
1171 };
1172 auto earliestUser = [&](const auto &Uses) {
1173 Instruction *User = nullptr;
1174 for (const Use &U : Uses) {
1175 auto *I = dyn_cast<Instruction>(U.getUser());
1176 assert(I != nullptr && "Load used in a non-instruction?");
1177 // Make sure we only consider users in this block, but we need
1178 // to remember if there were users outside the block too. This is
1179 // because if no users are found, aligned loads will not be created.
1180 if (I->getParent() == BaseBlock) {
1181 if (!isa<PHINode>(I))
1182 User = std::min(User, I, isEarlier);
1183 } else {
1184 User = std::min(User, BaseBlock->getTerminator(), isEarlier);
1185 }
1186 }
1187 return User;
1188 };
1189
1190 for (const ByteSpan::Block &B : VSpan) {
1191 ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size);
1192 for (const ByteSpan::Block &S : ASection) {
1193 auto &EU = EarliestUser[S.Seg.Val];
1194 EU = std::min(EU, earliestUser(B.Seg.Val->uses()), isEarlier);
1195 }
1196 }
1197
1198 LLVM_DEBUG({
1199 dbgs() << "ASpan:\n" << ASpan << '\n';
1200 dbgs() << "Earliest users of ASpan:\n";
1201 for (auto &[Val, User] : EarliestUser) {
1202 dbgs() << Val << "\n ->" << *User << '\n';
1203 }
1204 });
1205
1206 auto createLoad = [&](IRBuilderBase &Builder, const ByteSpan &VSpan,
1207 int Index, bool MakePred) {
1208 Value *Ptr =
1209 createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
1210 Value *Predicate =
1211 MakePred ? makeTestIfUnaligned(Builder, AlignVal, ScLen) : nullptr;
1212
1213 // If vector shifting is potentially needed, accumulate metadata
1214 // from source sections of twice the load width.
1215 int Start = (Index - DoAlign) * ScLen;
1216 int Width = (1 + DoAlign) * ScLen;
1217 return this->createLoad(Builder, SecTy, Ptr, Predicate, ScLen, True, Undef,
1218 VSpan.section(Start, Width).values());
1219 };
1220
1221 auto moveBefore = [this](BasicBlock::iterator In, BasicBlock::iterator To) {
1222 // Move In and its upward dependencies to before To.
1223 assert(In->getParent() == To->getParent());
1224 DepList Deps = getUpwardDeps(&*In, &*To);
1225 In->moveBefore(To);
1226 // DepList is sorted with respect to positions in the basic block.
1227 InstMap Map = cloneBefore(In, Deps);
1228 for (auto [Old, New] : Map)
1229 In->replaceUsesOfWith(Old, New);
1230 };
1231
1232 // Generate necessary loads at appropriate locations.
1233 LLVM_DEBUG(dbgs() << "Creating loads for ASpan sectors\n");
1234 for (int Index = 0; Index != NumSectors + 1; ++Index) {
1235 // In ASpan, each block will be either a single aligned load, or a
1236 // valign of a pair of loads. In the latter case, an aligned load j
1237 // will belong to the current valign, and the one in the previous
1238 // block (for j > 0).
1239 // Place the load at a location which will dominate the valign, assuming
1240 // the valign will be placed right before the earliest user.
1241 Instruction *PrevAt =
1242 DoAlign && Index > 0 ? EarliestUser[&ASpan[Index - 1]] : nullptr;
1243 Instruction *ThisAt =
1244 Index < NumSectors ? EarliestUser[&ASpan[Index]] : nullptr;
1245 if (auto *Where = std::min(PrevAt, ThisAt, isEarlier)) {
1246 Builder.SetInsertPoint(Where);
1247 Loads[Index] =
1248 createLoad(Builder, VSpan, Index, DoAlign && Index == NumSectors);
1249 // We know it's safe to put the load at BasePos, but we'd prefer to put
1250 // it at "Where". To see if the load is safe to be placed at Where, put
1251 // it there first and then check if it's safe to move it to BasePos.
1252 // If not, then the load needs to be placed at BasePos.
1253 // We can't do this check proactively because we need the load to exist
1254 // in order to check legality.
1255 if (auto *Load = dyn_cast<Instruction>(Loads[Index])) {
1256 if (!HVC.isSafeToMoveBeforeInBB(*Load, BasePos))
1257 moveBefore(Load->getIterator(), BasePos);
1258 }
1259 LLVM_DEBUG(dbgs() << "Loads[" << Index << "]:" << *Loads[Index] << '\n');
1260 }
1261 }
1262
1263 // Generate valigns if needed, and fill in proper values in ASpan
1264 LLVM_DEBUG(dbgs() << "Creating values for ASpan sectors\n");
1265 for (int Index = 0; Index != NumSectors; ++Index) {
1266 ASpan[Index].Seg.Val = nullptr;
1267 if (auto *Where = EarliestUser[&ASpan[Index]]) {
1268 Builder.SetInsertPoint(Where);
1269 Value *Val = Loads[Index];
1270 assert(Val != nullptr);
1271 if (DoAlign) {
1272 Value *NextLoad = Loads[Index + 1];
1273 assert(NextLoad != nullptr);
1274 Val = HVC.vralignb(Builder, Val, NextLoad, AlignVal);
1275 }
1276 ASpan[Index].Seg.Val = Val;
1277 LLVM_DEBUG(dbgs() << "ASpan[" << Index << "]:" << *Val << '\n');
1278 }
1279 }
1280
1281 for (const ByteSpan::Block &B : VSpan) {
1282 ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
1283 Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
1284 Builder.SetInsertPoint(cast<Instruction>(B.Seg.Val));
1285
1286 // We're generating a reduction, where each instruction depends on
1287 // the previous one, so we need to order them according to the position
1288 // of their inputs in the code.
1289 std::vector<ByteSpan::Block *> ABlocks;
1290 for (ByteSpan::Block &S : ASection) {
1291 if (S.Seg.Val != nullptr)
1292 ABlocks.push_back(&S);
1293 }
1294 llvm::sort(ABlocks,
1295 [&](const ByteSpan::Block *A, const ByteSpan::Block *B) {
1296 return isEarlier(cast<Instruction>(A->Seg.Val),
1297 cast<Instruction>(B->Seg.Val));
1298 });
1299 for (ByteSpan::Block *S : ABlocks) {
1300 // The processing of the data loaded by the aligned loads
1301 // needs to be inserted after the data is available.
1302 Instruction *SegI = cast<Instruction>(S->Seg.Val);
1303 Builder.SetInsertPoint(&*std::next(SegI->getIterator()));
1304 Value *Pay = HVC.vbytes(Builder, getPayload(S->Seg.Val));
1305 Accum =
1306 HVC.insertb(Builder, Accum, Pay, S->Seg.Start, S->Seg.Size, S->Pos);
1307 }
1308 // Instead of casting everything to bytes for the vselect, cast to the
1309 // original value type. This will avoid complications with casting masks.
1310 // For example, in cases when the original mask applied to i32, it could
1311 // be converted to a mask applicable to i8 via pred_typecast intrinsic,
1312 // but if the mask is not exactly of HVX length, extra handling would be
1313 // needed to make it work.
1314 Type *ValTy = getPayload(B.Seg.Val)->getType();
1315 Value *Cast = Builder.CreateBitCast(Accum, ValTy, "cst");
1316 Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,
1317 getPassThrough(B.Seg.Val), "sel");
1318 B.Seg.Val->replaceAllUsesWith(Sel);
1319 }
1320}
1321
1322auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
1323 const ByteSpan &VSpan, int ScLen,
1324 Value *AlignVal, Value *AlignAddr) const
1325 -> void {
1326 LLVM_DEBUG(dbgs() << __func__ << "\n");
1327
1328 Type *SecTy = HVC.getByteTy(ScLen);
1329 int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1330 bool DoAlign = !HVC.isZero(AlignVal);
1331
1332 // Stores.
1333 ByteSpan ASpanV, ASpanM;
1334
1335 // Return a vector value corresponding to the input value Val:
1336 // either <1 x Val> for scalar Val, or Val itself for vector Val.
1337 auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {
1338 Type *Ty = Val->getType();
1339 if (Ty->isVectorTy())
1340 return Val;
1341 auto *VecTy = VectorType::get(Ty, 1, /*Scalable=*/false);
1342 return Builder.CreateBitCast(Val, VecTy, "cst");
1343 };
1344
1345 // Create an extra "undef" sector at the beginning and at the end.
1346 // They will be used as the left/right filler in the vlalign step.
1347 for (int Index = (DoAlign ? -1 : 0); Index != NumSectors + DoAlign; ++Index) {
1348 // For stores, the size of each section is an aligned vector length.
1349 // Adjust the store offsets relative to the section start offset.
1350 ByteSpan VSection =
1351 VSpan.section(Index * ScLen, ScLen).shift(-Index * ScLen);
1352 Value *Undef = UndefValue::get(SecTy);
1353 Value *Zero = HVC.getNullValue(SecTy);
1354 Value *AccumV = Undef;
1355 Value *AccumM = Zero;
1356 for (ByteSpan::Block &S : VSection) {
1357 Value *Pay = getPayload(S.Seg.Val);
1358 Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
1359 Pay->getType(), HVC.getByteTy());
1360 Value *PartM = HVC.insertb(Builder, Zero, HVC.vbytes(Builder, Mask),
1361 S.Seg.Start, S.Seg.Size, S.Pos);
1362 AccumM = Builder.CreateOr(AccumM, PartM);
1363
1364 Value *PartV = HVC.insertb(Builder, Undef, HVC.vbytes(Builder, Pay),
1365 S.Seg.Start, S.Seg.Size, S.Pos);
1366
1367 AccumV = Builder.CreateSelect(
1368 Builder.CreateICmp(CmpInst::ICMP_NE, PartM, Zero), PartV, AccumV);
1369 }
1370 ASpanV.Blocks.emplace_back(AccumV, ScLen, Index * ScLen);
1371 ASpanM.Blocks.emplace_back(AccumM, ScLen, Index * ScLen);
1372 }
1373
1374 LLVM_DEBUG({
1375 dbgs() << "ASpanV before vlalign:\n" << ASpanV << '\n';
1376 dbgs() << "ASpanM before vlalign:\n" << ASpanM << '\n';
1377 });
1378
1379 // vlalign
1380 if (DoAlign) {
1381 for (int Index = 1; Index != NumSectors + 2; ++Index) {
1382 Value *PrevV = ASpanV[Index - 1].Seg.Val, *ThisV = ASpanV[Index].Seg.Val;
1383 Value *PrevM = ASpanM[Index - 1].Seg.Val, *ThisM = ASpanM[Index].Seg.Val;
1384 assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));
1385 ASpanV[Index - 1].Seg.Val = HVC.vlalignb(Builder, PrevV, ThisV, AlignVal);
1386 ASpanM[Index - 1].Seg.Val = HVC.vlalignb(Builder, PrevM, ThisM, AlignVal);
1387 }
1388 }
1389
1390 LLVM_DEBUG({
1391 dbgs() << "ASpanV after vlalign:\n" << ASpanV << '\n';
1392 dbgs() << "ASpanM after vlalign:\n" << ASpanM << '\n';
1393 });
1394
1395 auto createStore = [&](IRBuilderBase &Builder, const ByteSpan &ASpanV,
1396 const ByteSpan &ASpanM, int Index, bool MakePred) {
1397 Value *Val = ASpanV[Index].Seg.Val;
1398 Value *Mask = ASpanM[Index].Seg.Val; // bytes
1399 if (HVC.isUndef(Val) || HVC.isZero(Mask))
1400 return;
1401 Value *Ptr =
1402 createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
1403 Value *Predicate =
1404 MakePred ? makeTestIfUnaligned(Builder, AlignVal, ScLen) : nullptr;
1405
1406 // If vector shifting is potentially needed, accumulate metadata
1407 // from source sections of twice the store width.
1408 int Start = (Index - DoAlign) * ScLen;
1409 int Width = (1 + DoAlign) * ScLen;
1410 this->createStore(Builder, Val, Ptr, Predicate, ScLen,
1411 HVC.vlsb(Builder, Mask),
1412 VSpan.section(Start, Width).values());
1413 };
1414
1415 for (int Index = 0; Index != NumSectors + DoAlign; ++Index) {
1416 createStore(Builder, ASpanV, ASpanM, Index, DoAlign && Index == NumSectors);
1417 }
1418}
1419
1420auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
1421 LLVM_DEBUG(dbgs() << "Realigning group:\n" << Move << '\n');
1422
1423 // TODO: Needs support for masked loads/stores of "scalar" vectors.
1424 if (!Move.IsHvx)
1425 return false;
1426
1427 // Return the element with the maximum alignment from Range,
1428 // where GetValue obtains the value to compare from an element.
1429 auto getMaxOf = [](auto Range, auto GetValue) {
1430 return *llvm::max_element(Range, [&GetValue](auto &A, auto &B) {
1431 return GetValue(A) < GetValue(B);
1432 });
1433 };
1434
1435 const AddrList &BaseInfos = AddrGroups.at(Move.Base);
1436
1437 // Conceptually, there is a vector of N bytes covering the addresses
1438 // starting from the minimum offset (i.e. Base.Addr+Start). This vector
1439 // represents a contiguous memory region that spans all accessed memory
1440 // locations.
1441 // The correspondence between loaded or stored values will be expressed
1442 // in terms of this vector. For example, the 0th element of the vector
1443 // from the Base address info will start at byte Start from the beginning
1444 // of this conceptual vector.
1445 //
1446 // This vector will be loaded/stored starting at the nearest down-aligned
1447 // address and the amount of the down-alignment will be AlignVal:
1448 // valign(load_vector(align_down(Base+Start)), AlignVal)
1449
1450 std::set<Instruction *> TestSet(Move.Main.begin(), Move.Main.end());
1451 AddrList MoveInfos;
1452 llvm::copy_if(
1453 BaseInfos, std::back_inserter(MoveInfos),
1454 [&TestSet](const AddrInfo &AI) { return TestSet.count(AI.Inst); });
1455
1456 // Maximum alignment present in the whole address group.
1457 const AddrInfo &WithMaxAlign =
1458 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.HaveAlign; });
1459 Align MaxGiven = WithMaxAlign.HaveAlign;
1460
1461 // Element with the minimum offset in the move address group.
1462 const AddrInfo &WithMinOffset =
1463 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return -AI.Offset; });
1464
1465 const AddrInfo &WithMaxNeeded =
1466 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.NeedAlign; });
1467 Align MinNeeded = WithMaxNeeded.NeedAlign;
1468
1469 // Set the builder's insertion point right before the load group, or
1470 // immediately after the store group. (Instructions in a store group are
1471 // listed in reverse order.)
1472 Instruction *InsertAt = Move.Main.front();
1473 if (!Move.IsLoad) {
1474 // There should be a terminator (which a store isn't, but check anyway).
1475 assert(InsertAt->getIterator() != InsertAt->getParent()->end());
1476 InsertAt = &*std::next(InsertAt->getIterator());
1477 }
1478
1479 IRBuilder Builder(InsertAt->getParent(), InsertAt->getIterator(),
1480 InstSimplifyFolder(HVC.DL));
1481 Value *AlignAddr = nullptr; // Actual aligned address.
1482 Value *AlignVal = nullptr; // Right-shift amount (for valign).
1483
1484 if (MinNeeded <= MaxGiven) {
1485 int Start = WithMinOffset.Offset;
1486 int OffAtMax = WithMaxAlign.Offset;
1487 // Shift the offset of the maximally aligned instruction (OffAtMax)
1488 // back by just enough multiples of the required alignment to cover the
1489 // distance from Start to OffAtMax.
1490 // Calculate the address adjustment amount based on the address with the
1491 // maximum alignment. This is to allow a simple gep instruction instead
1492 // of potential bitcasts to i8*.
1493 int Adjust = -alignTo(OffAtMax - Start, MinNeeded.value());
1494 AlignAddr = createAdjustedPointer(Builder, WithMaxAlign.Addr,
1495 WithMaxAlign.ValTy, Adjust, Move.Clones);
1496 int Diff = Start - (OffAtMax + Adjust);
1497 AlignVal = HVC.getConstInt(Diff);
1498 assert(Diff >= 0);
1499 assert(static_cast<decltype(MinNeeded.value())>(Diff) < MinNeeded.value());
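    // Worked example (hypothetical offsets): with Start = -8, OffAtMax = 56
    // and MinNeeded = 64, Adjust = -alignTo(64, 64) = -64 and
    // Diff = -8 - (56 - 64) = 0. With OffAtMax = 60 instead,
    // Adjust = -alignTo(68, 64) = -128 and Diff = -8 - (60 - 128) = 60,
    // which satisfies 0 <= Diff < 64 as asserted above.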
1500 } else {
1501 // WithMinOffset is the lowest address in the group,
1502 // WithMinOffset.Addr = Base+Start.
1503 // Align instructions for both HVX (V6_valign) and scalar (S2_valignrb)
1504 // mask off unnecessary bits, so it's ok to just use the original pointer as
1505 // the alignment amount.
1506 // Do an explicit down-alignment of the address to avoid creating an
1507 // aligned instruction with an address that is not really aligned.
1508 AlignAddr =
1509 createAlignedPointer(Builder, WithMinOffset.Addr, WithMinOffset.ValTy,
1510 MinNeeded.value(), Move.Clones);
1511 AlignVal =
1512 Builder.CreatePtrToInt(WithMinOffset.Addr, HVC.getIntTy(), "pti");
1513 if (auto *I = dyn_cast<Instruction>(AlignVal)) {
1514 for (auto [Old, New] : Move.Clones)
1515 I->replaceUsesOfWith(Old, New);
1516 }
1517 }
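// Worked example for the aligned branch above (values assumed, not from a
// test): with Start = -4, OffAtMax = 10 and MinNeeded = 8,
//   Adjust = -alignTo(10 - (-4), 8) = -16,
//   Diff   = -4 - (10 + (-16))      = 2,
// so AlignAddr sits 16 bytes below the maximally aligned address (still
// 8-byte aligned) and the group is rotated right by 2 bytes.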
1518
1519 ByteSpan VSpan;
1520 for (const AddrInfo &AI : MoveInfos) {
1521 VSpan.Blocks.emplace_back(AI.Inst, HVC.getSizeOf(AI.ValTy),
1522 AI.Offset - WithMinOffset.Offset);
1523 }
1524
1525 // The aligned loads/stores will use blocks that are either scalars,
1526 // or HVX vectors. Let "sector" be the unified term for such a block.
1527 // blend(scalar, vector) -> sector...
1528 int ScLen = Move.IsHvx ? HVC.HST.getVectorLength()
1529 : std::max<int>(MinNeeded.value(), 4);
1530 assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);
1531 assert(Move.IsHvx || ScLen == 4 || ScLen == 8);
1532
1533 LLVM_DEBUG({
1534 dbgs() << "ScLen: " << ScLen << "\n";
1535 dbgs() << "AlignVal:" << *AlignVal << "\n";
1536 dbgs() << "AlignAddr:" << *AlignAddr << "\n";
1537 dbgs() << "VSpan:\n" << VSpan << '\n';
1538 });
1539
1540 if (Move.IsLoad)
1541 realignLoadGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1542 else
1543 realignStoreGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1544
1545 for (auto *Inst : Move.Main)
1546 Inst->eraseFromParent();
1547
1548 return true;
1549}
1550
1551auto AlignVectors::makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
1552 int Alignment) const -> Value * {
1553 auto *AlignTy = AlignVal->getType();
1554 Value *And = Builder.CreateAnd(
1555 AlignVal, ConstantInt::get(AlignTy, Alignment - 1), "and");
1556 Value *Zero = ConstantInt::get(AlignTy, 0);
1557 return Builder.CreateICmpNE(And, Zero, "isz");
1558}
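// For example, with Alignment = 128 the helper above emits the equivalent of
//   %and = and i32 %AlignVal, 127
//   %isz = icmp ne i32 %and, 0
// i.e. "true" exactly when AlignVal is not a multiple of 128.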
1559
1560auto AlignVectors::isSectorTy(Type *Ty) const -> bool {
1561 if (!HVC.isByteVecTy(Ty))
1562 return false;
1563 int Size = HVC.getSizeOf(Ty);
1564 if (HVC.HST.isTypeForHVX(Ty))
1565 return Size == static_cast<int>(HVC.HST.getVectorLength());
1566 return Size == 4 || Size == 8;
1567}
1568
1569auto AlignVectors::run() -> bool {
1570 LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName()
1571 << '\n');
1572 if (!createAddressGroups())
1573 return false;
1574
1575 LLVM_DEBUG({
1576 dbgs() << "Address groups(" << AddrGroups.size() << "):\n";
1577 for (auto &[In, AL] : AddrGroups) {
1578 for (const AddrInfo &AI : AL)
1579 dbgs() << "---\n" << AI << '\n';
1580 }
1581 });
1582
1583 bool Changed = false;
1584 MoveList LoadGroups, StoreGroups;
1585
1586 for (auto &G : AddrGroups) {
1587 llvm::append_range(LoadGroups, createLoadGroups(G.second));
1588 llvm::append_range(StoreGroups, createStoreGroups(G.second));
1589 }
1590
1591 LLVM_DEBUG({
1592 dbgs() << "\nLoad groups(" << LoadGroups.size() << "):\n";
1593 for (const MoveGroup &G : LoadGroups)
1594 dbgs() << G << "\n";
1595 dbgs() << "Store groups(" << StoreGroups.size() << "):\n";
1596 for (const MoveGroup &G : StoreGroups)
1597 dbgs() << G << "\n";
1598 });
1599
1600 // Cumulative limit on the number of groups.
1601 unsigned CountLimit = VAGroupCountLimit;
1602 if (CountLimit == 0)
1603 return false;
1604
1605 if (LoadGroups.size() > CountLimit) {
1606 LoadGroups.resize(CountLimit);
1607 StoreGroups.clear();
1608 } else {
1609 unsigned StoreLimit = CountLimit - LoadGroups.size();
1610 if (StoreGroups.size() > StoreLimit)
1611 StoreGroups.resize(StoreLimit);
1612 }
1613
1614 for (auto &M : LoadGroups)
1615 Changed |= moveTogether(M);
1616 for (auto &M : StoreGroups)
1617 Changed |= moveTogether(M);
1618
1619 LLVM_DEBUG(dbgs() << "After moveTogether:\n" << HVC.F);
1620
1621 for (auto &M : LoadGroups)
1622 Changed |= realignGroup(M);
1623 for (auto &M : StoreGroups)
1624 Changed |= realignGroup(M);
1625
1626 return Changed;
1627}
1628
1629// --- End AlignVectors
1630
1631// --- Begin HvxIdioms
1632
1633auto HvxIdioms::getNumSignificantBits(Value *V, Instruction *In) const
1634 -> std::pair<unsigned, Signedness> {
1635 unsigned Bits = HVC.getNumSignificantBits(V, In);
1636 // The significant bits are calculated including the sign bit. This may
1637 // add an extra bit for zero-extended values, e.g. (zext i32 to i64) may
1638 // result in 33 significant bits. To avoid extra words, skip the extra
1639 // sign bit, but keep information that the value is to be treated as
1640 // unsigned.
1641 KnownBits Known = HVC.getKnownBits(V, In);
1642 Signedness Sign = Signed;
1643 unsigned NumToTest = 0; // Number of bits used in test for unsignedness.
1644 if (isPowerOf2_32(Bits))
1645 NumToTest = Bits;
1646 else if (Bits > 1 && isPowerOf2_32(Bits - 1))
1647 NumToTest = Bits - 1;
1648
1649 if (NumToTest != 0 && Known.Zero.ashr(NumToTest).isAllOnes()) {
1650 Sign = Unsigned;
1651 Bits = NumToTest;
1652 }
1653
1654 // If the top bit of the nearest power-of-2 is zero, this value is
1655 // positive. It could be treated as either signed or unsigned.
1656 if (unsigned Pow2 = PowerOf2Ceil(Bits); Pow2 != Bits) {
1657 if (Known.Zero.ashr(Pow2 - 1).isAllOnes())
1658 Sign = Positive;
1659 }
1660 return {Bits, Sign};
1661}
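// Worked example (assumed input): for V = zext i8 %b to i16 the query reports
// 9 significant bits (including the sign bit) and the top 8 bits known zero.
// NumToTest becomes 8 and Known.Zero.ashr(8) is all-ones, so the result
// collapses to {8, Unsigned} rather than {9, Signed}.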
1662
1663auto HvxIdioms::canonSgn(SValue X, SValue Y) const
1664 -> std::pair<SValue, SValue> {
1665 // Canonicalize the signedness of X and Y, so that the result is one of:
1666 // S, S
1667 // U/P, S
1668 // U/P, U/P
1669 if (X.Sgn == Signed && Y.Sgn != Signed)
1670 std::swap(X, Y);
1671 return {X, Y};
1672}
1673
1674// Match
1675// (X * Y) [>> N], or
1676// ((X * Y) + (1 << M)) >> N
1677auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> {
1678 using namespace PatternMatch;
1679 auto *Ty = In.getType();
1680
1681 if (!Ty->isVectorTy() || !Ty->getScalarType()->isIntegerTy())
1682 return std::nullopt;
1683
1684 unsigned Width = cast<IntegerType>(Ty->getScalarType())->getBitWidth();
1685
1686 FxpOp Op;
1687 Value *Exp = &In;
1688
1689 // Fixed-point multiplication is always shifted right (except when the
1690 // fraction is 0 bits).
1691 auto m_Shr = [](auto &&V, auto &&S) {
1692 return m_CombineOr(m_LShr(V, S), m_AShr(V, S));
1693 };
1694
1695 uint64_t Qn = 0;
1696 if (Value *T; match(Exp, m_Shr(m_Value(T), m_ConstantInt(Qn)))) {
1697 Op.Frac = Qn;
1698 Exp = T;
1699 } else {
1700 Op.Frac = 0;
1701 }
1702
1703 if (Op.Frac > Width)
1704 return std::nullopt;
1705
1706 // Check if there is rounding added.
1707 uint64_t CV;
1708 if (Value *T;
1709 Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_ConstantInt(CV)))) {
1710 if (CV != 0 && !isPowerOf2_64(CV))
1711 return std::nullopt;
1712 if (CV != 0)
1713 Op.RoundAt = Log2_64(CV);
1714 Exp = T;
1715 }
1716
1717 // Check if the rest is a multiplication.
1718 if (match(Exp, m_Mul(m_Value(Op.X.Val), m_Value(Op.Y.Val)))) {
1719 Op.Opcode = Instruction::Mul;
1720 // FIXME: The information below is recomputed.
1721 Op.X.Sgn = getNumSignificantBits(Op.X.Val, &In).second;
1722 Op.Y.Sgn = getNumSignificantBits(Op.Y.Val, &In).second;
1723 Op.ResTy = cast<VectorType>(Ty);
1724 return Op;
1725 }
1726
1727 return std::nullopt;
1728}
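// Shape recognized above, written as fixed-point arithmetic (illustration):
//   In = (X * Y) >> Qn                -> Frac = Qn, no RoundAt
//   In = ((X * Y) + (1 << M)) >> Qn   -> Frac = Qn, RoundAt = M
// e.g. a rounding Q15 multiply, (X * Y + 0x4000) >> 15, yields Frac = 15 and
// RoundAt = 14.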
1729
1730auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const
1731 -> Value * {
1732 assert(Op.X.Val->getType() == Op.Y.Val->getType());
1733
1734 auto *VecTy = dyn_cast<VectorType>(Op.X.Val->getType());
1735 if (VecTy == nullptr)
1736 return nullptr;
1737 auto *ElemTy = cast<IntegerType>(VecTy->getElementType());
1738 unsigned ElemWidth = ElemTy->getBitWidth();
1739
1740 // TODO: This can be relaxed after legalization is done pre-isel.
1741 if ((HVC.length(VecTy) * ElemWidth) % (8 * HVC.HST.getVectorLength()) != 0)
1742 return nullptr;
1743
1744 // There are no special intrinsics that should be used for multiplying
1745 // signed 8-bit values, so just skip them. Normal codegen should handle
1746 // this just fine.
1747 if (ElemWidth <= 8)
1748 return nullptr;
1749 // Similarly, if this is just a multiplication that can be handled without
1750 // intervention, then leave it alone.
1751 if (ElemWidth <= 32 && Op.Frac == 0)
1752 return nullptr;
1753
1754 auto [BitsX, SignX] = getNumSignificantBits(Op.X.Val, &In);
1755 auto [BitsY, SignY] = getNumSignificantBits(Op.Y.Val, &In);
1756
1757 // TODO: Add multiplication of vectors by scalar registers (up to 4 bytes).
1758
1759 Value *X = Op.X.Val, *Y = Op.Y.Val;
1760 IRBuilder Builder(In.getParent(), In.getIterator(),
1761 InstSimplifyFolder(HVC.DL));
1762
1763 auto roundUpWidth = [](unsigned Width) -> unsigned {
1764 if (Width <= 32 && !isPowerOf2_32(Width)) {
1765 // If the element width is not a power of 2, round it up
1766 // to the next one. Do this for widths not exceeding 32.
1767 return PowerOf2Ceil(Width);
1768 }
1769 if (Width > 32 && Width % 32 != 0) {
1770 // For wider elements, round it up to the multiple of 32.
1771 return alignTo(Width, 32u);
1772 }
1773 return Width;
1774 };
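// For illustration: roundUpWidth(12) == 16, roundUpWidth(32) == 32,
// roundUpWidth(40) == 64, and roundUpWidth(96) == 96.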
1775
1776 BitsX = roundUpWidth(BitsX);
1777 BitsY = roundUpWidth(BitsY);
1778
1779 // For elementwise multiplication vectors must have the same lengths, so
1780 // resize the elements of both inputs to the same width, the max of the
1781 // calculated significant bits.
1782 unsigned Width = std::max(BitsX, BitsY);
1783
1784 auto *ResizeTy = VectorType::get(HVC.getIntTy(Width), VecTy);
1785 if (Width < ElemWidth) {
1786 X = Builder.CreateTrunc(X, ResizeTy, "trn");
1787 Y = Builder.CreateTrunc(Y, ResizeTy, "trn");
1788 } else if (Width > ElemWidth) {
1789 X = SignX == Signed ? Builder.CreateSExt(X, ResizeTy, "sxt")
1790 : Builder.CreateZExt(X, ResizeTy, "zxt");
1791 Y = SignY == Signed ? Builder.CreateSExt(Y, ResizeTy, "sxt")
1792 : Builder.CreateZExt(Y, ResizeTy, "zxt");
1793 };
1794
1795 assert(X->getType() == Y->getType() && X->getType() == ResizeTy);
1796
1797 unsigned VecLen = HVC.length(ResizeTy);
1798 unsigned ChopLen = (8 * HVC.HST.getVectorLength()) / std::min(Width, 32u);
1799
1800 SmallVector<Value *> Results;
1801 FxpOp ChopOp = Op;
1802 ChopOp.ResTy = VectorType::get(Op.ResTy->getElementType(), ChopLen, false);
1803
1804 for (unsigned V = 0; V != VecLen / ChopLen; ++V) {
1805 ChopOp.X.Val = HVC.subvector(Builder, X, V * ChopLen, ChopLen);
1806 ChopOp.Y.Val = HVC.subvector(Builder, Y, V * ChopLen, ChopLen);
1807 Results.push_back(processFxpMulChopped(Builder, In, ChopOp));
1808 if (Results.back() == nullptr)
1809 break;
1810 }
1811
1812 if (Results.empty() || Results.back() == nullptr)
1813 return nullptr;
1814
1815 Value *Cat = HVC.concat(Builder, Results);
1816 Value *Ext = SignX == Signed || SignY == Signed
1817 ? Builder.CreateSExt(Cat, VecTy, "sxt")
1818 : Builder.CreateZExt(Cat, VecTy, "zxt");
1819 return Ext;
1820}
1821
1822inline bool HvxIdioms::matchScatter(Instruction &In) const {
1823 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
1824 if (!II)
1825 return false;
1826 return (II->getIntrinsicID() == Intrinsic::masked_scatter);
1827}
1828
1829inline bool HvxIdioms::matchGather(Instruction &In) const {
1830 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
1831 if (!II)
1832 return false;
1833 return (II->getIntrinsicID() == Intrinsic::masked_gather);
1834}
1835
1836Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual);
1837
1838// Binary instructions we want to handle as users of gather/scatter.
1839inline bool isArithmetic(unsigned Opc) {
1840 switch (Opc) {
1841 case Instruction::Add:
1842 case Instruction::Sub:
1843 case Instruction::Mul:
1844 case Instruction::And:
1845 case Instruction::Or:
1846 case Instruction::Xor:
1847 case Instruction::AShr:
1848 case Instruction::LShr:
1849 case Instruction::Shl:
1850 case Instruction::UDiv:
1851 return true;
1852 }
1853 return false;
1854}
1855
1856// TODO: Maybe use MemoryLocation for this. See getLocOrNone above.
1858 assert(Ptr && "Unable to extract pointer");
1860 return Ptr;
1864 if (II->getIntrinsicID() == Intrinsic::masked_store)
1865 return II->getOperand(1);
1866 }
1867 return nullptr;
1868}
1869
1870 Instruction *selectDestination(Instruction *In,
1871 HvxIdioms::DstQualifier &Qual) {
1872 Instruction *Destination = nullptr;
1873 if (!In)
1874 return Destination;
1875 if (isa<StoreInst>(In)) {
1876 Destination = In;
1877 Qual = HvxIdioms::LdSt;
1878 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {
1879 if (II->getIntrinsicID() == Intrinsic::masked_gather) {
1880 Destination = In;
1881 Qual = HvxIdioms::LLVM_Gather;
1882 } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) {
1883 Destination = In;
1884 Qual = HvxIdioms::LLVM_Scatter;
1885 } else if (II->getIntrinsicID() == Intrinsic::masked_store) {
1886 Destination = In;
1887 Qual = HvxIdioms::LdSt;
1888 } else if (II->getIntrinsicID() ==
1889 Intrinsic::hexagon_V6_vgather_vscattermh) {
1890 Destination = In;
1891 Qual = HvxIdioms::HEX_Gather_Scatter;
1892 } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) {
1893 Destination = In;
1894 Qual = HvxIdioms::HEX_Scatter;
1895 } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) {
1896 Destination = In;
1897 Qual = HvxIdioms::HEX_Gather;
1898 }
1899 } else if (isa<ZExtInst>(In)) {
1900 return locateDestination(In, Qual);
1901 } else if (isa<CastInst>(In)) {
1902 return locateDestination(In, Qual);
1903 } else if (isa<CallInst>(In)) {
1904 Destination = In;
1905 Qual = HvxIdioms::Call;
1906 } else if (isa<GetElementPtrInst>(In)) {
1907 return locateDestination(In, Qual);
1908 } else if (isArithmetic(In->getOpcode())) {
1909 Destination = In;
1910 Qual = HvxIdioms::Arithmetic;
1911 } else {
1912 LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n");
1913 }
1914 return Destination;
1915}
1916
1917 // This method attempts to find the destination (user) for a given intrinsic.
1918 // Given that these are produced only by Ripple, the number of options is
1919 // limited. The simplest case is an explicit store, which is in fact redundant
1920 // (since the HVX gather creates its own store during packetization).
1921 // Nevertheless we need to figure out the address where we are storing. The
1922 // other cases are more complicated, but still few.
1923Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) {
1924 Instruction *Destination = nullptr;
1925 if (!In)
1926 return Destination;
1927 // Get all possible destinations
1928 SmallVector<Instruction *> Users;
1929 // Iterate over the uses of the instruction
1930 for (auto &U : In->uses()) {
1931 if (auto *UI = dyn_cast<Instruction>(U.getUser())) {
1932 Destination = selectDestination(UI, Qual);
1933 if (Destination)
1934 Users.push_back(Destination);
1935 }
1936 }
1937 // Now see which of the users (if any) is a memory destination.
1938 for (auto *I : Users)
1939 if (getPointer(I))
1940 return I;
1941 return Destination;
1942}
1943
1944// The two intrinsics we handle here have GEP in a different position.
1946 assert(In && "Bad instruction");
1948 assert((IIn && (IIn->getIntrinsicID() == Intrinsic::masked_gather ||
1949 IIn->getIntrinsicID() == Intrinsic::masked_scatter)) &&
1950 "Not a gather Intrinsic");
1951 GetElementPtrInst *GEPIndex = nullptr;
1952 if (IIn->getIntrinsicID() == Intrinsic::masked_gather)
1953 GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(0));
1954 else
1955 GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(1));
1956 return GEPIndex;
1957}
1958
1959// Given the intrinsic find its GEP argument and extract base address it uses.
1960// The method relies on the way how Ripple typically forms the GEP for
1961// scatter/gather.
1964 if (!GEPIndex) {
1965 LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
1966 return nullptr;
1967 }
1968 Value *BaseAddress = GEPIndex->getPointerOperand();
1969 auto *IndexLoad = dyn_cast<LoadInst>(BaseAddress);
1970 if (IndexLoad)
1971 return IndexLoad;
1972
1973 auto *IndexZEx = dyn_cast<ZExtInst>(BaseAddress);
1974 if (IndexZEx) {
1975 IndexLoad = dyn_cast<LoadInst>(IndexZEx->getOperand(0));
1976 if (IndexLoad)
1977 return IndexLoad;
1978 IntrinsicInst *II = dyn_cast<IntrinsicInst>(IndexZEx->getOperand(0));
1979 if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
1981 }
1982 auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(BaseAddress);
1983 if (BaseShuffle) {
1984 IndexLoad = dyn_cast<LoadInst>(BaseShuffle->getOperand(0));
1985 if (IndexLoad)
1986 return IndexLoad;
1987 auto *IE = dyn_cast<InsertElementInst>(BaseShuffle->getOperand(0));
1988 if (IE) {
1989 auto *Src = IE->getOperand(1);
1990 IndexLoad = dyn_cast<LoadInst>(Src);
1991 if (IndexLoad)
1992 return IndexLoad;
1993 auto *Alloca = dyn_cast<AllocaInst>(Src);
1994 if (Alloca)
1995 return Alloca;
1996 if (isa<Argument>(Src)) {
1997 return Src;
1998 }
1999 if (isa<GlobalValue>(Src)) {
2000 return Src;
2001 }
2002 }
2003 }
2004 LLVM_DEBUG(dbgs() << " Unable to locate Address from intrinsic\n");
2005 return nullptr;
2006}
2007
2008 Type *getIndexType(Value *In) {
2009 if (!In)
2010 return nullptr;
2011
2012 if (isa<LoadInst>(In) || isa<StoreInst>(In))
2013 return getLoadStoreType(In);
2014
2015 if (auto *II = dyn_cast<IntrinsicInst>(In)) {
2016 if (II->getIntrinsicID() == Intrinsic::masked_load)
2017 return II->getType();
2018 if (II->getIntrinsicID() == Intrinsic::masked_store)
2019 return II->getOperand(0)->getType();
2020 }
2021 return In->getType();
2022}
2023
2024 Value *locateIndexesFromGEP(Value *In) {
2025 if (!In)
2026 return nullptr;
2027 if (isa<LoadInst>(In))
2028 return In;
2029 if (auto *II = dyn_cast<IntrinsicInst>(In)) {
2030 if (II->getIntrinsicID() == Intrinsic::masked_load)
2031 return In;
2032 if (II->getIntrinsicID() == Intrinsic::masked_gather)
2033 return In;
2034 }
2035 if (auto *IndexZEx = dyn_cast<ZExtInst>(In))
2036 return locateIndexesFromGEP(IndexZEx->getOperand(0));
2037 if (auto *IndexSEx = dyn_cast<SExtInst>(In))
2038 return locateIndexesFromGEP(IndexSEx->getOperand(0));
2039 if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(In))
2040 return locateIndexesFromGEP(BaseShuffle->getOperand(0));
2041 if (auto *IE = dyn_cast<InsertElementInst>(In))
2042 return locateIndexesFromGEP(IE->getOperand(1));
2043 if (auto *cstDataVector = dyn_cast<ConstantDataVector>(In))
2044 return cstDataVector;
2045 if (auto *GEPIndex = dyn_cast<GetElementPtrInst>(In))
2046 return GEPIndex->getOperand(0);
2047 return nullptr;
2048}
2049
2050 // Given the intrinsic find its GEP argument and extract offsets from the base
2051 // address it uses.
2054 if (!GEPIndex) {
2055 LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
2056 return nullptr;
2057 }
2058 Value *Indexes = GEPIndex->getOperand(1);
2059 if (auto *IndexLoad = locateIndexesFromGEP(Indexes))
2060 return IndexLoad;
2061
2062 LLVM_DEBUG(dbgs() << " Unable to locate Index from intrinsic\n");
2063 return nullptr;
2064}
2065
2066 // Because of the awkward definition of many Hexagon intrinsics we often have
2067 // to reinterpret the HVX-native <64 x i16> as <32 x i32>, which in practice is
2068 // a NOP for all use cases, so this only exists to make the IR builder happy.
2069inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC,
2070 IRBuilderBase &Builder,
2071 LLVMContext &Ctx, Value *I) {
2072 assert(I && "Unable to reinterprete cast");
2073 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2074 std::vector<unsigned> shuffleMask;
2075 for (unsigned i = 0; i < 64; ++i)
2076 shuffleMask.push_back(i);
2077 Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
2078 Value *CastShuffle =
2079 Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
2080 return Builder.CreateBitCast(CastShuffle, NT, "cst64_i16_to_32_i32");
2081}
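// The helper above is equivalent to the following IR shape (sketch, assuming
// 128-byte HVX vectors):
//   %identity_shuffle = shufflevector <64 x i16> %I, <64 x i16> %I,
//                                     <64 x i32> <i32 0, i32 1, ..., i32 63>
//   %cst64_i16_to_32_i32 = bitcast <64 x i16> %identity_shuffle to <32 x i32>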
2082
2083// Recast <128 x i8> as <32 x i32>
2084inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC,
2085 IRBuilderBase &Builder,
2086 LLVMContext &Ctx, Value *I) {
2087 assert(I && "Unable to reinterprete cast");
2088 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2089 std::vector<unsigned> shuffleMask;
2090 for (unsigned i = 0; i < 128; ++i)
2091 shuffleMask.push_back(i);
2092 Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
2093 Value *CastShuffle =
2094 Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
2095 return Builder.CreateBitCast(CastShuffle, NT, "cst128_i8_to_32_i32");
2096}
2097
2098// Create <32 x i32> mask reinterpreted as <128 x i1> with a given pattern
2099inline Value *get_i32_Mask(const HexagonVectorCombine &HVC,
2100 IRBuilderBase &Builder, LLVMContext &Ctx,
2101 unsigned int pattern) {
2102 std::vector<unsigned int> byteMask;
2103 for (unsigned i = 0; i < 32; ++i)
2104 byteMask.push_back(pattern);
2105
2106 return Builder.CreateIntrinsic(
2107 HVC.getBoolTy(128), HVC.HST.getIntrinsicId(Hexagon::V6_vandvrt),
2108 {llvm::ConstantDataVector::get(Ctx, byteMask), HVC.getConstInt(~0)},
2109 nullptr);
2110}
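// A pattern of 0x00ff00ff, as used below, is intended to produce a <128 x i1>
// predicate with the bit set for the low byte of every halfword (bytes 0 and 2
// of each word), which the byte-wise scatter/gather paths use as QByteMask.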
2111
2112Value *HvxIdioms::processVScatter(Instruction &In) const {
2113 auto *InpTy = dyn_cast<VectorType>(In.getOperand(0)->getType());
2114 assert(InpTy && "Cannot handle no vector type for llvm.scatter/gather");
2115 unsigned InpSize = HVC.getSizeOf(InpTy);
2116 auto *F = In.getFunction();
2117 LLVMContext &Ctx = F->getContext();
2118 auto *ElemTy = dyn_cast<IntegerType>(InpTy->getElementType());
2119 assert(ElemTy && "llvm.scatter needs integer type argument");
2120 unsigned ElemWidth = HVC.DL.getTypeAllocSize(ElemTy);
2121 LLVM_DEBUG({
2122 unsigned Elements = HVC.length(InpTy);
2123 dbgs() << "\n[Process scatter](" << In << ")\n" << *In.getParent() << "\n";
2124 dbgs() << " Input type(" << *InpTy << ") elements(" << Elements
2125 << ") VecLen(" << InpSize << ") type(" << *ElemTy << ") ElemWidth("
2126 << ElemWidth << ")\n";
2127 });
2128
2129 IRBuilder Builder(In.getParent(), In.getIterator(),
2130 InstSimplifyFolder(HVC.DL));
2131
2132 auto *ValueToScatter = In.getOperand(0);
2133 LLVM_DEBUG(dbgs() << " ValueToScatter : " << *ValueToScatter << "\n");
2134
2135 if (HVC.HST.getVectorLength() != InpSize) {
2136 LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize
2137 << ") for vscatter\n");
2138 return nullptr;
2139 }
2140
2141 // Base address of indexes.
2142 auto *IndexLoad = locateAddressFromIntrinsic(&In);
2143 if (!IndexLoad)
2144 return nullptr;
2145 LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
2146
2147 // Address of destination. Must be in VTCM.
2148 auto *Ptr = getPointer(IndexLoad);
2149 if (!Ptr)
2150 return nullptr;
2151 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2152 // Indexes/offsets
2153 auto *Indexes = locateIndexesFromIntrinsic(&In);
2154 if (!Indexes)
2155 return nullptr;
2156 LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
2157 Value *CastedDst = Builder.CreateBitOrPointerCast(Ptr, Type::getInt32Ty(Ctx),
2158 "cst_ptr_to_i32");
2159 LLVM_DEBUG(dbgs() << " CastedDst : " << *CastedDst << "\n");
2160 // Adjust Indexes
2161 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2162 Value *CastIndex = nullptr;
2163 if (cstDataVector) {
2164 // Our indexes are represented as a constant. We need it in a reg.
2165 AllocaInst *IndexesAlloca =
2166 Builder.CreateAlloca(HVC.getHvxTy(HVC.getIntTy(32), false));
2167 [[maybe_unused]] auto *StoreIndexes =
2168 Builder.CreateStore(cstDataVector, IndexesAlloca);
2169 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2170 CastIndex = Builder.CreateLoad(IndexesAlloca->getAllocatedType(),
2171 IndexesAlloca, "reload_index");
2172 } else {
2173 if (ElemWidth == 2)
2174 CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2175 else
2176 CastIndex = Indexes;
2177 }
2178 LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
2179
2180 if (ElemWidth == 1) {
2181 // v128i8 There is no native instruction for this.
2182 // Do this as two Hi/Lo gathers with masking.
2183 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2184 // Extend indexes. We assume that indexes are in 128i8 format - need to
2185 // expand them to Hi/Lo 64i16
2186 Value *CastIndexes = Builder.CreateBitCast(CastIndex, NT, "cast_to_32i32");
2187 auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
2188 auto *UnpackedIndexes = Builder.CreateIntrinsic(
2189 HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastIndexes, nullptr);
2190 LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes << ")\n");
2191
2192 auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
2193 auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
2194 [[maybe_unused]] Value *IndexHi =
2195 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
2196 [[maybe_unused]] Value *IndexLo =
2197 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
2198 LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
2199 LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
2200 // Now unpack values to scatter
2201 Value *CastSrc =
2202 getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, ValueToScatter);
2203 LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
2204 auto *UnpackedValueToScatter = Builder.CreateIntrinsic(
2205 HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastSrc, nullptr);
2206 LLVM_DEBUG(dbgs() << " UnpackedValToScat: " << *UnpackedValueToScatter
2207 << ")\n");
2208
2209 [[maybe_unused]] Value *UVSHi =
2210 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedValueToScatter);
2211 [[maybe_unused]] Value *UVSLo =
2212 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedValueToScatter);
2213 LLVM_DEBUG(dbgs() << " UVSHi : " << *UVSHi << ")\n");
2214 LLVM_DEBUG(dbgs() << " UVSLo : " << *UVSLo << ")\n");
2215
2216 // Create the mask for individual bytes
2217 auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
2218 LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
2219 [[maybe_unused]] auto *ResHi = Builder.CreateIntrinsic(
2220 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
2221 {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2222 IndexHi, UVSHi},
2223 nullptr);
2224 LLVM_DEBUG(dbgs() << " ResHi : " << *ResHi << ")\n");
2225 return Builder.CreateIntrinsic(
2226 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
2227 {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2228 IndexLo, UVSLo},
2229 nullptr);
2230 } else if (ElemWidth == 2) {
2231 Value *CastSrc =
2232 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, ValueToScatter);
2233 LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
2234 return Builder.CreateIntrinsic(
2235 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermh_128B,
2236 {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
2237 CastSrc},
2238 nullptr);
2239 } else if (ElemWidth == 4) {
2240 return Builder.CreateIntrinsic(
2241 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermw_128B,
2242 {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
2243 ValueToScatter},
2244 nullptr);
2245 } else {
2246 LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n");
2247 return nullptr;
2248 }
2249}
2250
2251Value *HvxIdioms::processVGather(Instruction &In) const {
2252 [[maybe_unused]] auto *InpTy =
2253 dyn_cast<VectorType>(In.getOperand(0)->getType());
2254 assert(InpTy && "Cannot handle no vector type for llvm.gather");
2255 [[maybe_unused]] auto *ElemTy =
2256 dyn_cast<PointerType>(InpTy->getElementType());
2257 assert(ElemTy && "llvm.gather needs vector of ptr argument");
2258 auto *F = In.getFunction();
2259 LLVMContext &Ctx = F->getContext();
2260 LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n"
2261 << *In.getParent() << "\n");
2262 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2263 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2264 << ") type(" << *ElemTy << ") Access alignment("
2265 << *In.getOperand(1) << ") AddressSpace("
2266 << ElemTy->getAddressSpace() << ")\n");
2267
2268 // TODO: Handle masking of elements.
2269 assert(dyn_cast<VectorType>(In.getOperand(2)->getType()) &&
2270 "llvm.gather needs vector for mask");
2271 IRBuilder Builder(In.getParent(), In.getIterator(),
2272 InstSimplifyFolder(HVC.DL));
2273
2274 // See who is using the result. The difference between the LLVM and HVX vgather
2275 // intrinsics makes it impossible to handle all cases with temp storage. Alloca
2276 // in VTCM is not yet supported, so for now we just bail out for those cases.
2277 HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined;
2278 Instruction *Dst = locateDestination(&In, Qual);
2279 if (!Dst) {
2280 LLVM_DEBUG(dbgs() << " Unable to locate vgather destination\n");
2281 return nullptr;
2282 }
2283 LLVM_DEBUG(dbgs() << " Destination : " << *Dst << " Qual(" << Qual
2284 << ")\n");
2285
2286 // Address of destination. Must be in VTCM.
2287 auto *Ptr = getPointer(Dst);
2288 if (!Ptr) {
2289 LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n");
2290 return nullptr;
2291 }
2292
2293 // Result type. Assume it is a vector type.
2294 auto *DstType = cast<VectorType>(getIndexType(Dst));
2295 assert(DstType && "Cannot handle non vector dst type for llvm.gather");
2296
2297 // Base address for sources to be loaded
2298 auto *IndexLoad = locateAddressFromIntrinsic(&In);
2299 if (!IndexLoad)
2300 return nullptr;
2301 LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
2302
2303 // Gather indexes/offsets
2304 auto *Indexes = locateIndexesFromIntrinsic(&In);
2305 if (!Indexes)
2306 return nullptr;
2307 LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
2308
2309 Instruction *Gather = nullptr;
2310 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2311 if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) {
2312 // We fully assume the address space is in VTCM. We also assume that all
2313 // pointers in Operand(0) have the same base(!).
2314 // This is the most basic case of all the above.
2315 unsigned OutputSize = HVC.getSizeOf(DstType);
2316 auto *DstElemTy = cast<IntegerType>(DstType->getElementType());
2317 unsigned ElemWidth = HVC.DL.getTypeAllocSize(DstElemTy);
2318 LLVM_DEBUG(dbgs() << " Buffer type : " << *Ptr->getType()
2319 << " Address space ("
2320 << Ptr->getType()->getPointerAddressSpace() << ")\n"
2321 << " Result type : " << *DstType
2322 << "\n Size in bytes : " << OutputSize
2323 << " element type(" << *DstElemTy
2324 << ")\n ElemWidth : " << ElemWidth << " bytes\n");
2325
2326 auto *IndexType = cast<VectorType>(getIndexType(Indexes));
2327 assert(IndexType && "Cannot handle non vector index type for llvm.gather");
2328 unsigned IndexWidth = HVC.DL.getTypeAllocSize(IndexType->getElementType());
2329 LLVM_DEBUG(dbgs() << " IndexWidth(" << IndexWidth << ")\n");
2330
2331 // Intrinsic takes i32 instead of pointer so cast.
2332 Value *CastedPtr = Builder.CreateBitOrPointerCast(
2333 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2334 // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...]
2335 // int_hexagon_V6_vgathermh [... , llvm_v16i32_ty]
2336 // int_hexagon_V6_vgathermh_128B [... , llvm_v32i32_ty]
2337 // int_hexagon_V6_vgathermhw [... , llvm_v32i32_ty]
2338 // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty]
2339 // int_hexagon_V6_vgathermw [... , llvm_v16i32_ty]
2340 // int_hexagon_V6_vgathermw_128B [... , llvm_v32i32_ty]
2341 if (HVC.HST.getVectorLength() == OutputSize) {
2342 if (ElemWidth == 1) {
2343 // v128i8 There is no native instruction for this.
2344 // Do this as two Hi/Lo gathers with masking.
2345 // Unpack indexes. We assume that indexes are in 128i8 format - need to
2346 // expand them to Hi/Lo 64i16
2347 Value *CastIndexes =
2348 Builder.CreateBitCast(Indexes, NT, "cast_to_32i32");
2349 auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
2350 auto *UnpackedIndexes =
2351 Builder.CreateIntrinsic(HVC.getHvxTy(HVC.getIntTy(32), true),
2352 V6_vunpack, CastIndexes, nullptr);
2353 LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes
2354 << ")\n");
2355
2356 auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
2357 auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
2358 [[maybe_unused]] Value *IndexHi =
2359 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
2360 [[maybe_unused]] Value *IndexLo =
2361 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
2362 LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
2363 LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
2364 // Create the mask for individual bytes
2365 auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
2366 LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
2367 // We use our destination allocation as a temp storage
2368 // This is unlikely to work properly for masked gather.
2369 auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermhq);
2370 [[maybe_unused]] auto GatherHi = Builder.CreateIntrinsic(
2371 Type::getVoidTy(Ctx), V6_vgather,
2372 {Ptr, QByteMask, CastedPtr,
2373 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi},
2374 nullptr);
2375 LLVM_DEBUG(dbgs() << " GatherHi : " << *GatherHi << ")\n");
2376 // Rematerialize the result
2377 [[maybe_unused]] Value *LoadedResultHi = Builder.CreateLoad(
2378 HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_hi");
2379 LLVM_DEBUG(dbgs() << " LoadedResultHi : " << *LoadedResultHi << "\n");
2380 // Same for the low part. Here we use Gather to return non-NULL result
2381 // from this function and continue to iterate. We also are deleting Dst
2382 // store below.
2383 Gather = Builder.CreateIntrinsic(
2384 Type::getVoidTy(Ctx), V6_vgather,
2385 {Ptr, QByteMask, CastedPtr,
2386 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo},
2387 nullptr);
2388 LLVM_DEBUG(dbgs() << " GatherLo : " << *Gather << ")\n");
2389 Value *LoadedResultLo = Builder.CreateLoad(
2390 HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_lo");
2391 LLVM_DEBUG(dbgs() << " LoadedResultLo : " << *LoadedResultLo << "\n");
2392 // Now we have properly sized bytes in every other position
2393 // B b A a c a A b B c f F g G h H is presented as
2394 // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . H
2395 // Use vpack to gather them
2396 auto V6_vpackeb = HVC.HST.getIntrinsicId(Hexagon::V6_vpackeb);
2397 [[maybe_unused]] auto Res = Builder.CreateIntrinsic(
2398 NT, V6_vpackeb, {LoadedResultHi, LoadedResultLo}, nullptr);
2399 LLVM_DEBUG(dbgs() << " ScaledRes : " << *Res << "\n");
2400 [[maybe_unused]] auto *StoreRes = Builder.CreateStore(Res, Ptr);
2401 LLVM_DEBUG(dbgs() << " StoreRes : " << *StoreRes << "\n");
2402 } else if (ElemWidth == 2) {
2403 // v32i16
2404 if (IndexWidth == 2) {
2405 // Reinterpret 64i16 as 32i32. Only needed for a syntactic IR match.
2406 Value *CastIndex =
2407 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2408 LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
2409 // shift all i16 left by 1 to match short addressing mode instead of
2410 // byte.
2411 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2412 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2413 Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
2415 << " Shifted half index: " << *AdjustedIndex << ")\n");
2416
2417 auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermh);
2418 // The 3rd argument is the size of the region to gather from. Probably
2419 // want to set it to max VTCM size.
2420 Gather = Builder.CreateIntrinsic(
2421 Type::getVoidTy(Ctx), V6_vgather,
2422 {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2423 AdjustedIndex},
2424 nullptr);
2425 for (auto &U : Dst->uses()) {
2426 if (auto *UI = dyn_cast<Instruction>(U.getUser()))
2427 dbgs() << " dst used by: " << *UI << "\n";
2428 }
2429 for (auto &U : In.uses()) {
2430 if (auto *UI = dyn_cast<Instruction>(U.getUser()))
2431 dbgs() << " In used by : " << *UI << "\n";
2432 }
2433 // Create temp load from result in case the result is used by any
2434 // other instruction.
2435 Value *LoadedResult = Builder.CreateLoad(
2436 HVC.getHvxTy(HVC.getIntTy(16), false), Ptr, "temp_result");
2437 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2438 In.replaceAllUsesWith(LoadedResult);
2439 } else {
2440 dbgs() << " Unhandled index type for vgather\n";
2441 return nullptr;
2442 }
2443 } else if (ElemWidth == 4) {
2444 if (IndexWidth == 4) {
2445 // v32i32
2446 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2447 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2448 Builder, V6_vaslh, NT, {Indexes, HVC.getConstInt(2)});
2450 << " Shifted word index: " << *AdjustedIndex << ")\n");
2451 Gather = Builder.CreateIntrinsic(
2452 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermw_128B,
2453 {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2454 AdjustedIndex},
2455 nullptr);
2456 } else {
2457 LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n");
2458 return nullptr;
2459 }
2460 } else {
2461 LLVM_DEBUG(dbgs() << " Unhandled element type for vgather\n");
2462 return nullptr;
2463 }
2464 } else if (HVC.HST.getVectorLength() == OutputSize * 2) {
2465 // This is half of the reg width, duplicate low in high
2466 LLVM_DEBUG(dbgs() << " Unhandled half of register size\n");
2467 return nullptr;
2468 } else if (HVC.HST.getVectorLength() * 2 == OutputSize) {
2469 LLVM_DEBUG(dbgs() << " Unhandle twice the register size\n");
2470 return nullptr;
2471 }
2472 // Erase the original intrinsic and store that consumes it.
2473 // HVX will create a pseudo for gather that is expanded to gather + store
2474 // during packetization.
2475 Dst->eraseFromParent();
2476 } else if (Qual == HvxIdioms::LLVM_Scatter) {
2477 // Gather feeds directly into scatter.
2478 LLVM_DEBUG({
2479 auto *DstInpTy = cast<VectorType>(Dst->getOperand(1)->getType());
2480 assert(DstInpTy && "Cannot handle no vector type for llvm.scatter");
2481 unsigned DstInpSize = HVC.getSizeOf(DstInpTy);
2482 unsigned DstElements = HVC.length(DstInpTy);
2483 auto *DstElemTy = cast<PointerType>(DstInpTy->getElementType());
2484 assert(DstElemTy && "llvm.scatter needs vector of ptr argument");
2485 dbgs() << " Gather feeds into scatter\n Values to scatter : "
2486 << *Dst->getOperand(0) << "\n";
2487 dbgs() << " Dst type(" << *DstInpTy << ") elements(" << DstElements
2488 << ") VecLen(" << DstInpSize << ") type(" << *DstElemTy
2489 << ") Access alignment(" << *Dst->getOperand(2) << ")\n";
2490 });
2491 // Address of source
2492 auto *Src = getPointer(IndexLoad);
2493 if (!Src)
2494 return nullptr;
2495 LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
2496
2497 if (!isa<PointerType>(Src->getType())) {
2498 LLVM_DEBUG(dbgs() << " Source is not a pointer type...\n");
2499 return nullptr;
2500 }
2501
2502 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2503 Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2504 LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
2505
2506 auto *DstLoad = locateAddressFromIntrinsic(Dst);
2507 if (!DstLoad) {
2508 LLVM_DEBUG(dbgs() << " Unable to locate DstLoad\n");
2509 return nullptr;
2510 }
2511 LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
2512
2513 Value *Ptr = getPointer(DstLoad);
2514 if (!Ptr)
2515 return nullptr;
2516 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2517 Value *CastIndex =
2518 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, IndexLoad);
2519 LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
2520 // Shift all i16 left by 1 to match short addressing mode instead of
2521 // byte.
2522 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2523 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2524 Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
2525 LLVM_DEBUG(dbgs() << " Shifted half index: " << *AdjustedIndex << ")\n");
2526
2527 return Builder.CreateIntrinsic(
2528 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2529 {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2530 AdjustedIndex},
2531 nullptr);
2532 } else if (Qual == HvxIdioms::HEX_Gather_Scatter) {
2533 // Gather feeds into a previously inserted pseudo intrinsic.
2534 // These cannot be in the same packet, so we need to generate another
2535 // pseudo that is expanded to .tmp + store V6_vgathermh_pseudo
2536 // V6_vgathermh_pseudo (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt,
2537 // ModRegs:$Mu, HvxVR:$Vv)
2538 if (isa<AllocaInst>(IndexLoad)) {
2539 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2540 if (cstDataVector) {
2541 // Our indexes are represented as a constant. We need them in a reg.
2542 // This most likely will not work properly since alloca gives us a DDR
2543 // stack location. This will be fixed once we teach the compiler about VTCM.
2544 AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
2545 [[maybe_unused]] auto *StoreIndexes =
2546 Builder.CreateStore(cstDataVector, IndexesAlloca);
2547 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2548 Value *LoadedIndex = Builder.CreateLoad(
2549 IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");
2550 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2551 LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca << "\n");
2552
2553 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2554 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2555 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2556
2557 Gather = Builder.CreateIntrinsic(
2558 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2559 {ResultAlloca, CastedSrc,
2560 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
2561 nullptr);
2562 Value *LoadedResult = Builder.CreateLoad(
2563 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2564 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2565 LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
2566 In.replaceAllUsesWith(LoadedResult);
2567 }
2568 } else {
2569 // Address of source
2570 auto *Src = getPointer(IndexLoad);
2571 if (!Src)
2572 return nullptr;
2573 LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
2574
2575 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2576 Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2577 LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
2578
2579 auto *DstLoad = locateAddressFromIntrinsic(Dst);
2580 if (!DstLoad)
2581 return nullptr;
2582 LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
2583 auto *Ptr = getPointer(DstLoad);
2584 if (!Ptr)
2585 return nullptr;
2586 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2587
2588 Gather = Builder.CreateIntrinsic(
2589 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgather_vscattermh,
2590 {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2591 Indexes},
2592 nullptr);
2593 }
2594 return Gather;
2595 } else if (Qual == HvxIdioms::HEX_Scatter) {
2596 // This is the case when the result of a gather is used as an argument to
2597 // Intrinsic::hexagon_V6_vscattermh_128B. Most likely we just inserted it
2598 // ourselves. We have to create an alloca, store to it, and replace all uses
2599 // with that.
2600 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2601 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2602 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2603 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2604 Value *CastIndex =
2605 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2606 LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
2607
2608 Gather = Builder.CreateIntrinsic(
2609 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2610 {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2611 CastIndex},
2612 nullptr);
2613 Value *LoadedResult = Builder.CreateLoad(
2614 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2615 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2616 In.replaceAllUsesWith(LoadedResult);
2617 } else if (Qual == HvxIdioms::HEX_Gather) {
2618 // Gather feeds into another gather that has already been replaced with
2619 // hexagon_V6_vgathermh_128B.
2620 if (isa<AllocaInst>(IndexLoad)) {
2621 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2622 if (cstDataVector) {
2623 // Our indexes are represented as a constant. We need it in a reg.
2624 AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
2625
2626 [[maybe_unused]] auto *StoreIndexes =
2627 Builder.CreateStore(cstDataVector, IndexesAlloca);
2628 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2629 Value *LoadedIndex = Builder.CreateLoad(
2630 IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");
2631 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2632 LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca
2633 << "\n AddressSpace: "
2634 << ResultAlloca->getAddressSpace() << "\n";);
2635
2636 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2637 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2638 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2639
2640 Gather = Builder.CreateIntrinsic(
2641 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2642 {ResultAlloca, CastedSrc,
2643 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
2644 nullptr);
2645 Value *LoadedResult = Builder.CreateLoad(
2646 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2647 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2648 LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
2649 In.replaceAllUsesWith(LoadedResult);
2650 }
2651 }
2652 } else if (Qual == HvxIdioms::LLVM_Gather) {
2653 // Gather feeds into another gather
2654 errs() << " Underimplemented vgather to vgather sequence\n";
2655 return nullptr;
2656 } else
2657 llvm_unreachable("Unhandled Qual enum");
2658
2659 return Gather;
2660}
2661
2662auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
2663 const FxpOp &Op) const -> Value * {
2664 assert(Op.X.Val->getType() == Op.Y.Val->getType());
2665 auto *InpTy = cast<VectorType>(Op.X.Val->getType());
2666 unsigned Width = InpTy->getScalarSizeInBits();
2667 bool Rounding = Op.RoundAt.has_value();
2668
2669 if (!Op.RoundAt || *Op.RoundAt == Op.Frac - 1) {
2670 // The fixed-point intrinsics do signed multiplication.
2671 if (Width == Op.Frac + 1 && Op.X.Sgn != Unsigned && Op.Y.Sgn != Unsigned) {
2672 Value *QMul = nullptr;
2673 if (Width == 16) {
2674 QMul = createMulQ15(Builder, Op.X, Op.Y, Rounding);
2675 } else if (Width == 32) {
2676 QMul = createMulQ31(Builder, Op.X, Op.Y, Rounding);
2677 }
2678 if (QMul != nullptr)
2679 return QMul;
2680 }
2681 }
2682
2683 assert(Width >= 32 || isPowerOf2_32(Width)); // Width <= 32 => Width is 2^n
2684 assert(Width < 32 || Width % 32 == 0); // Width > 32 => Width is 32*k
2685
2686 // If Width < 32, then it should really be 16.
2687 if (Width < 32) {
2688 if (Width < 16)
2689 return nullptr;
2690 // Getting here with Op.Frac == 0 isn't wrong, but suboptimal: here we
2691 // generate a full-precision product, which is unnecessary if there is
2692 // no shift.
2693 assert(Width == 16);
2694 assert(Op.Frac != 0 && "Unshifted mul should have been skipped");
2695 if (Op.Frac == 16) {
2696 // Multiply high
2697 if (Value *MulH = createMulH16(Builder, Op.X, Op.Y))
2698 return MulH;
2699 }
2700 // Do full-precision multiply and shift.
2701 Value *Prod32 = createMul16(Builder, Op.X, Op.Y);
2702 if (Rounding) {
2703 Value *RoundVal = HVC.getConstSplat(Prod32->getType(), 1 << *Op.RoundAt);
2704 Prod32 = Builder.CreateAdd(Prod32, RoundVal, "add");
2705 }
2706
2707 Value *ShiftAmt = HVC.getConstSplat(Prod32->getType(), Op.Frac);
2708 Value *Shifted = Op.X.Sgn == Signed || Op.Y.Sgn == Signed
2709 ? Builder.CreateAShr(Prod32, ShiftAmt, "asr")
2710 : Builder.CreateLShr(Prod32, ShiftAmt, "lsr");
2711 return Builder.CreateTrunc(Shifted, InpTy, "trn");
2712 }
2713
2714 // Width >= 32
2715
2716 // Break up the arguments Op.X and Op.Y into vectors of smaller widths
2717 // in preparation of doing the multiplication by 32-bit parts.
2718 auto WordX = HVC.splitVectorElements(Builder, Op.X.Val, /*ToWidth=*/32);
2719 auto WordY = HVC.splitVectorElements(Builder, Op.Y.Val, /*ToWidth=*/32);
2720 auto WordP = createMulLong(Builder, WordX, Op.X.Sgn, WordY, Op.Y.Sgn);
2721
2722 auto *HvxWordTy = cast<VectorType>(WordP.front()->getType());
2723
2724 // Add the optional rounding to the proper word.
2725 if (Op.RoundAt.has_value()) {
2726 Value *Zero = HVC.getNullValue(WordX[0]->getType());
2727 SmallVector<Value *> RoundV(WordP.size(), Zero);
2728 RoundV[*Op.RoundAt / 32] =
2729 HVC.getConstSplat(HvxWordTy, 1 << (*Op.RoundAt % 32));
2730 WordP = createAddLong(Builder, WordP, RoundV);
2731 }
2732
2733 // createRightShiftLong?
2734
2735 // Shift all products right by Op.Frac.
2736 unsigned SkipWords = Op.Frac / 32;
2737 Constant *ShiftAmt = HVC.getConstSplat(HvxWordTy, Op.Frac % 32);
2738
2739 for (int Dst = 0, End = WordP.size() - SkipWords; Dst != End; ++Dst) {
2740 int Src = Dst + SkipWords;
2741 Value *Lo = WordP[Src];
2742 if (Src + 1 < End) {
2743 Value *Hi = WordP[Src + 1];
2744 WordP[Dst] = Builder.CreateIntrinsic(HvxWordTy, Intrinsic::fshr,
2745 {Hi, Lo, ShiftAmt},
2746 /*FMFSource*/ nullptr, "int");
2747 } else {
2748 // The shift of the most significant word.
2749 WordP[Dst] = Builder.CreateAShr(Lo, ShiftAmt, "asr");
2750 }
2751 }
2752 if (SkipWords != 0)
2753 WordP.resize(WordP.size() - SkipWords);
2754
2755 return HVC.joinVectorElements(Builder, WordP, Op.ResTy);
2756}
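// Worked example for the word-wise shift above (values assumed): for 64-bit
// elements split into 32-bit words and Op.Frac = 8, SkipWords = 0 and
// ShiftAmt = 8, so with WordP = {W0, W1, W2, W3}:
//   WordP[0] = fshr(W1, W0, 8)    // bits  8..39 of the product
//   WordP[1] = fshr(W2, W1, 8)    // bits 40..71
//   WordP[2] = fshr(W3, W2, 8)    // bits 72..103
//   WordP[3] = ashr(W3, 8)        // sign-extended top word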
2757
2758auto HvxIdioms::createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
2759 bool Rounding) const -> Value * {
2760 assert(X.Val->getType() == Y.Val->getType());
2761 assert(X.Val->getType()->getScalarType() == HVC.getIntTy(16));
2762 assert(HVC.HST.isHVXVectorType(EVT::getEVT(X.Val->getType(), false)));
2763
2764 // There is no non-rounding intrinsic for i16.
2765 if (!Rounding || X.Sgn == Unsigned || Y.Sgn == Unsigned)
2766 return nullptr;
2767
2768 auto V6_vmpyhvsrs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhvsrs);
2769 return HVC.createHvxIntrinsic(Builder, V6_vmpyhvsrs, X.Val->getType(),
2770 {X.Val, Y.Val});
2771}
2772
2773auto HvxIdioms::createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
2774 bool Rounding) const -> Value * {
2775 Type *InpTy = X.Val->getType();
2776 assert(InpTy == Y.Val->getType());
2777 assert(InpTy->getScalarType() == HVC.getIntTy(32));
2778 assert(HVC.HST.isHVXVectorType(EVT::getEVT(InpTy, false)));
2779
2780 if (X.Sgn == Unsigned || Y.Sgn == Unsigned)
2781 return nullptr;
2782
2783 auto V6_vmpyewuh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyewuh);
2784 auto V6_vmpyo_acc = Rounding
2785 ? HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_rnd_sacc)
2786 : HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_sacc);
2787 Value *V1 =
2788 HVC.createHvxIntrinsic(Builder, V6_vmpyewuh, InpTy, {X.Val, Y.Val});
2789 return HVC.createHvxIntrinsic(Builder, V6_vmpyo_acc, InpTy,
2790 {V1, X.Val, Y.Val});
2791}
2792
2793auto HvxIdioms::createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
2794 Value *CarryIn) const
2795 -> std::pair<Value *, Value *> {
2796 assert(X->getType() == Y->getType());
2797 auto VecTy = cast<VectorType>(X->getType());
2798 if (VecTy == HvxI32Ty && HVC.HST.useHVXV62Ops()) {
2799 SmallVector<Value *> Args = {X, Y};
2800 Intrinsic::ID AddCarry;
2801 if (CarryIn == nullptr && HVC.HST.useHVXV66Ops()) {
2802 AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarryo);
2803 } else {
2804 AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarry);
2805 if (CarryIn == nullptr)
2806 CarryIn = HVC.getNullValue(HVC.getBoolTy(HVC.length(VecTy)));
2807 Args.push_back(CarryIn);
2808 }
2809 Value *Ret = HVC.createHvxIntrinsic(Builder, AddCarry,
2810 /*RetTy=*/nullptr, Args);
2811 Value *Result = Builder.CreateExtractValue(Ret, {0}, "ext");
2812 Value *CarryOut = Builder.CreateExtractValue(Ret, {1}, "ext");
2813 return {Result, CarryOut};
2814 }
2815
2816 // In other cases, do a regular add, and unsigned compare-less-than.
2817 // The carry-out can originate in two places: adding the carry-in or adding
2818 // the two input values.
2819 Value *Result1 = X; // Result1 = X + CarryIn
2820 if (CarryIn != nullptr) {
2821 unsigned Width = VecTy->getScalarSizeInBits();
2822 uint32_t Mask = 1;
2823 if (Width < 32) {
2824 for (unsigned i = 0, e = 32 / Width; i != e; ++i)
2825 Mask = (Mask << Width) | 1;
2826 }
2827 auto V6_vandqrt = HVC.HST.getIntrinsicId(Hexagon::V6_vandqrt);
2828 Value *ValueIn =
2829 HVC.createHvxIntrinsic(Builder, V6_vandqrt, /*RetTy=*/nullptr,
2830 {CarryIn, HVC.getConstInt(Mask)});
2831 Result1 = Builder.CreateAdd(X, ValueIn, "add");
2832 }
2833
2834 Value *CarryOut1 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result1, X, "cmp");
2835 Value *Result2 = Builder.CreateAdd(Result1, Y, "add");
2836 Value *CarryOut2 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result2, Y, "cmp");
2837 return {Result2, Builder.CreateOr(CarryOut1, CarryOut2, "orb")};
2838}
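// Scalar model of the fallback path above (illustrative sketch only, not part
// of this pass; assumes 32-bit unsigned):
static inline unsigned addCarryModel(unsigned X, unsigned Y, bool CarryIn,
                                     bool &CarryOut) {
  unsigned R1 = X + (CarryIn ? 1u : 0u); // Result1 = X + CarryIn
  bool C1 = R1 < X;                      // carry out of adding the carry-in
  unsigned R2 = R1 + Y;                  // Result2 = Result1 + Y
  bool C2 = R2 < Y;                      // carry out of adding Y
  CarryOut = C1 || C2;                   // either addition may overflow
  return R2;
}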
2839
2840auto HvxIdioms::createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const
2841 -> Value * {
2842 Intrinsic::ID V6_vmpyh = 0;
2843 std::tie(X, Y) = canonSgn(X, Y);
2844
2845 if (X.Sgn == Signed) {
2846 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhv);
2847 } else if (Y.Sgn == Signed) {
2848 // In vmpyhus the second operand is unsigned
2849 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhus);
2850 } else {
2851 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhv);
2852 }
2853
2854 // i16*i16 -> i32 / interleaved
2855 Value *P =
2856 HVC.createHvxIntrinsic(Builder, V6_vmpyh, HvxP32Ty, {Y.Val, X.Val});
2857 // Deinterleave
2858 return HVC.vshuff(Builder, HVC.sublo(Builder, P), HVC.subhi(Builder, P));
2859}
2860
2861auto HvxIdioms::createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
2862 -> Value * {
2863 Type *HvxI16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/false);
2864
2865 if (HVC.HST.useHVXV69Ops()) {
2866 if (X.Sgn != Signed && Y.Sgn != Signed) {
2867 auto V6_vmpyuhvs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhvs);
2868 return HVC.createHvxIntrinsic(Builder, V6_vmpyuhvs, HvxI16Ty,
2869 {X.Val, Y.Val});
2870 }
2871 }
2872
2873 Type *HvxP16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/true);
2874 Value *Pair16 =
2875 Builder.CreateBitCast(createMul16(Builder, X, Y), HvxP16Ty, "cst");
2876 unsigned Len = HVC.length(HvxP16Ty) / 2;
2877
2878 SmallVector<int, 128> PickOdd(Len);
2879 for (int i = 0; i != static_cast<int>(Len); ++i)
2880 PickOdd[i] = 2 * i + 1;
2881
2882 return Builder.CreateShuffleVector(
2883 HVC.sublo(Builder, Pair16), HVC.subhi(Builder, Pair16), PickOdd, "shf");
2884}
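// The PickOdd shuffle above selects elements 1, 3, 5, ... of the deinterleaved
// i16 view, i.e. the upper halfword of every 32-bit product (little-endian
// layout), which is exactly the "multiply-high" result for i16 x i16.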
2885
2886auto HvxIdioms::createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
2887 -> std::pair<Value *, Value *> {
2888 assert(X.Val->getType() == Y.Val->getType());
2889 assert(X.Val->getType() == HvxI32Ty);
2890
2891 Intrinsic::ID V6_vmpy_parts;
2892 std::tie(X, Y) = canonSgn(X, Y);
2893
2894 if (X.Sgn == Signed) {
2895 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyss_parts;
2896 } else if (Y.Sgn == Signed) {
2897 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyus_parts;
2898 } else {
2899 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyuu_parts;
2900 }
2901
2902 Value *Parts = HVC.createHvxIntrinsic(Builder, V6_vmpy_parts, nullptr,
2903 {X.Val, Y.Val}, {HvxI32Ty});
2904 Value *Hi = Builder.CreateExtractValue(Parts, {0}, "ext");
2905 Value *Lo = Builder.CreateExtractValue(Parts, {1}, "ext");
2906 return {Lo, Hi};
2907}
2908
2909auto HvxIdioms::createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
2910 ArrayRef<Value *> WordY) const
2911 -> SmallVector<Value *> {
2912 assert(WordX.size() == WordY.size());
2913 unsigned Idx = 0, Length = WordX.size();
2914 SmallVector<Value *> Sum(Length);
2915
2916 while (Idx != Length) {
2917 if (HVC.isZero(WordX[Idx]))
2918 Sum[Idx] = WordY[Idx];
2919 else if (HVC.isZero(WordY[Idx]))
2920 Sum[Idx] = WordX[Idx];
2921 else
2922 break;
2923 ++Idx;
2924 }
2925
2926 Value *Carry = nullptr;
2927 for (; Idx != Length; ++Idx) {
2928 std::tie(Sum[Idx], Carry) =
2929 createAddCarry(Builder, WordX[Idx], WordY[Idx], Carry);
2930 }
2931
2932 // This drops the final carry beyond the highest word.
2933 return Sum;
2934}
2935
2936auto HvxIdioms::createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
2937 Signedness SgnX, ArrayRef<Value *> WordY,
2938 Signedness SgnY) const -> SmallVector<Value *> {
2939 SmallVector<SmallVector<Value *>> Products(WordX.size() + WordY.size());
2940
2941 // WordX[i] * WordY[j] produces words i+j and i+j+1 of the results,
2942 // that is halves 2(i+j), 2(i+j)+1, 2(i+j)+2, 2(i+j)+3.
2943 for (int i = 0, e = WordX.size(); i != e; ++i) {
2944 for (int j = 0, f = WordY.size(); j != f; ++j) {
2945 // Check the 4 halves that this multiplication can generate.
2946 Signedness SX = (i + 1 == e) ? SgnX : Unsigned;
2947 Signedness SY = (j + 1 == f) ? SgnY : Unsigned;
2948 auto [Lo, Hi] = createMul32(Builder, {WordX[i], SX}, {WordY[j], SY});
2949 Products[i + j + 0].push_back(Lo);
2950 Products[i + j + 1].push_back(Hi);
2951 }
2952 }
2953
2954 Value *Zero = HVC.getNullValue(WordX[0]->getType());
2955
2956 auto pop_back_or_zero = [Zero](auto &Vector) -> Value * {
2957 if (Vector.empty())
2958 return Zero;
2959 auto Last = Vector.back();
2960 Vector.pop_back();
2961 return Last;
2962 };
2963
2964 for (int i = 0, e = Products.size(); i != e; ++i) {
2965 while (Products[i].size() > 1) {
2966 Value *Carry = nullptr; // no carry-in
2967 for (int j = i; j != e; ++j) {
2968 auto &ProdJ = Products[j];
2969 auto [Sum, CarryOut] = createAddCarry(Builder, pop_back_or_zero(ProdJ),
2970 pop_back_or_zero(ProdJ), Carry);
2971 ProdJ.insert(ProdJ.begin(), Sum);
2972 Carry = CarryOut;
2973 }
2974 }
2975 }
2976
2977   SmallVector<Value *> WordP;
2978   for (auto &P : Products) {
2979 assert(P.size() == 1 && "Should have been added together");
2980 WordP.push_back(P.front());
2981 }
2982
2983 return WordP;
2984}
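// Editorial sketch, not part of the original source: a scalar model of the
// schoolbook long multiplication above (unsigned case, helper invented for
// illustration). WordX[i] * WordY[j] contributes its low word to position
// i+j and its high word to position i+j+1; the per-position partial sums are
// then folded with carry propagation, which is roughly what the
// createAddCarry chains do lane-wise on vectors.
#include <cstddef>
#include <cstdint>
#include <vector>
static std::vector<uint32_t> scalarMulLongU(const std::vector<uint32_t> &X,
                                            const std::vector<uint32_t> &Y) {
  std::vector<uint64_t> Acc(X.size() + Y.size(), 0);
  for (std::size_t i = 0, e = X.size(); i != e; ++i) {
    for (std::size_t j = 0, f = Y.size(); j != f; ++j) {
      uint64_t P = static_cast<uint64_t>(X[i]) * Y[j];
      Acc[i + j] += static_cast<uint32_t>(P);           // low word
      Acc[i + j + 1] += static_cast<uint32_t>(P >> 32); // high word
    }
  }
  std::vector<uint32_t> Words(Acc.size());
  uint64_t Carry = 0;
  for (std::size_t k = 0, e = Acc.size(); k != e; ++k) {
    uint64_t S = Acc[k] + Carry;
    Words[k] = static_cast<uint32_t>(S);
    Carry = S >> 32;
  }
  return Words;
}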
2985
2986auto HvxIdioms::run() -> bool {
2987 bool Changed = false;
2988
2989 for (BasicBlock &B : HVC.F) {
2990 for (auto It = B.rbegin(); It != B.rend(); ++It) {
2991 if (auto Fxm = matchFxpMul(*It)) {
2992 Value *New = processFxpMul(*It, *Fxm);
2993 // Always report "changed" for now.
2994 Changed = true;
2995 if (!New)
2996 continue;
2997 bool StartOver = !isa<Instruction>(New);
2998 It->replaceAllUsesWith(New);
2999         RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);
3000         It = StartOver ? B.rbegin()
3001 : cast<Instruction>(New)->getReverseIterator();
3002 Changed = true;
3003 } else if (matchGather(*It)) {
3004 Value *New = processVGather(*It);
3005 if (!New)
3006 continue;
3007 LLVM_DEBUG(dbgs() << " Gather : " << *New << "\n");
3008         // We replace the original intrinsic with a new pseudo call.
3009 It->eraseFromParent();
3010 It = cast<Instruction>(New)->getReverseIterator();
3012 Changed = true;
3013 } else if (matchScatter(*It)) {
3014 Value *New = processVScatter(*It);
3015 if (!New)
3016 continue;
3017 LLVM_DEBUG(dbgs() << " Scatter : " << *New << "\n");
3018         // We replace the original intrinsic with a new pseudo call.
3019 It->eraseFromParent();
3020 It = cast<Instruction>(New)->getReverseIterator();
3022 Changed = true;
3023 }
3024 }
3025 }
3026
3027 return Changed;
3028}
3029
3030// --- End HvxIdioms
3031
3032auto HexagonVectorCombine::run() -> bool {
3033 if (DumpModule)
3034 dbgs() << "Module before HexagonVectorCombine\n" << *F.getParent();
3035
3036 bool Changed = false;
3037 if (HST.useHVXOps()) {
3038 if (VAEnabled)
3039 Changed |= AlignVectors(*this).run();
3040 if (VIEnabled)
3041 Changed |= HvxIdioms(*this).run();
3042 }
3043
3044 if (DumpModule) {
3045 dbgs() << "Module " << (Changed ? "(modified)" : "(unchanged)")
3046 << " after HexagonVectorCombine\n"
3047 << *F.getParent();
3048 }
3049 return Changed;
3050}
3051
3052auto HexagonVectorCombine::getIntTy(unsigned Width) const -> IntegerType * {
3053 return IntegerType::get(F.getContext(), Width);
3054}
3055
3056auto HexagonVectorCombine::getByteTy(int ElemCount) const -> Type * {
3057 assert(ElemCount >= 0);
3058 IntegerType *ByteTy = Type::getInt8Ty(F.getContext());
3059 if (ElemCount == 0)
3060 return ByteTy;
3061 return VectorType::get(ByteTy, ElemCount, /*Scalable=*/false);
3062}
3063
3064auto HexagonVectorCombine::getBoolTy(int ElemCount) const -> Type * {
3065 assert(ElemCount >= 0);
3066 IntegerType *BoolTy = Type::getInt1Ty(F.getContext());
3067 if (ElemCount == 0)
3068 return BoolTy;
3069 return VectorType::get(BoolTy, ElemCount, /*Scalable=*/false);
3070}
3071
3072auto HexagonVectorCombine::getConstInt(int Val, unsigned Width) const
3073 -> ConstantInt * {
3074 return ConstantInt::getSigned(getIntTy(Width), Val);
3075}
3076
3077auto HexagonVectorCombine::isZero(const Value *Val) const -> bool {
3078 if (auto *C = dyn_cast<Constant>(Val))
3079 return C->isZeroValue();
3080 return false;
3081}
3082
3083auto HexagonVectorCombine::getIntValue(const Value *Val) const
3084 -> std::optional<APInt> {
3085 if (auto *CI = dyn_cast<ConstantInt>(Val))
3086 return CI->getValue();
3087 return std::nullopt;
3088}
3089
3090auto HexagonVectorCombine::isUndef(const Value *Val) const -> bool {
3091 return isa<UndefValue>(Val);
3092}
3093
3094auto HexagonVectorCombine::isTrue(const Value *Val) const -> bool {
3095 return Val == ConstantInt::getTrue(Val->getType());
3096}
3097
3098auto HexagonVectorCombine::isFalse(const Value *Val) const -> bool {
3099 return isZero(Val);
3100}
3101
3102auto HexagonVectorCombine::getHvxTy(Type *ElemTy, bool Pair) const
3103 -> VectorType * {
3104 EVT ETy = EVT::getEVT(ElemTy, false);
3105 assert(ETy.isSimple() && "Invalid HVX element type");
3106 // Do not allow boolean types here: they don't have a fixed length.
3107 assert(HST.isHVXElementType(ETy.getSimpleVT(), /*IncludeBool=*/false) &&
3108 "Invalid HVX element type");
3109 unsigned HwLen = HST.getVectorLength();
3110 unsigned NumElems = (8 * HwLen) / ETy.getSizeInBits();
3111 return VectorType::get(ElemTy, Pair ? 2 * NumElems : NumElems,
3112 /*Scalable=*/false);
3113}
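// Editorial worked example, not part of the original source: assuming the
// 128-byte HVX vector length, a 16-bit element type gives
// NumElems = (8 * 128) / 16 = 64, so the single-vector type is <64 x i16> and
// the pair form is <128 x i16>.
static_assert((8 * 128) / 16 == 64, "single HVX vector of i16: 64 lanes");
static_assert(2 * ((8 * 128) / 16) == 128, "HVX vector pair of i16: 128 lanes");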
3114
3115auto HexagonVectorCombine::getSizeOf(const Value *Val, SizeKind Kind) const
3116 -> int {
3117 return getSizeOf(Val->getType(), Kind);
3118}
3119
3120auto HexagonVectorCombine::getSizeOf(const Type *Ty, SizeKind Kind) const
3121 -> int {
3122 auto *NcTy = const_cast<Type *>(Ty);
3123 switch (Kind) {
3124 case Store:
3125 return DL.getTypeStoreSize(NcTy).getFixedValue();
3126 case Alloc:
3127 return DL.getTypeAllocSize(NcTy).getFixedValue();
3128 }
3129 llvm_unreachable("Unhandled SizeKind enum");
3130}
3131
3132auto HexagonVectorCombine::getTypeAlignment(Type *Ty) const -> int {
3133 // The actual type may be shorter than the HVX vector, so determine
3134 // the alignment based on subtarget info.
3135 if (HST.isTypeForHVX(Ty))
3136 return HST.getVectorLength();
3137 return DL.getABITypeAlign(Ty).value();
3138}
3139
3140auto HexagonVectorCombine::length(Value *Val) const -> size_t {
3141 return length(Val->getType());
3142}
3143
3144auto HexagonVectorCombine::length(Type *Ty) const -> size_t {
3145 auto *VecTy = dyn_cast<VectorType>(Ty);
3146 assert(VecTy && "Must be a vector type");
3147 return VecTy->getElementCount().getFixedValue();
3148}
3149
3150auto HexagonVectorCombine::getNullValue(Type *Ty) const -> Constant * {
3151   assert(Ty->isIntOrIntVectorTy());
3152   auto Zero = ConstantInt::get(Ty->getScalarType(), 0);
3153 if (auto *VecTy = dyn_cast<VectorType>(Ty))
3154 return ConstantVector::getSplat(VecTy->getElementCount(), Zero);
3155 return Zero;
3156}
3157
3158auto HexagonVectorCombine::getFullValue(Type *Ty) const -> Constant * {
3159   assert(Ty->isIntOrIntVectorTy());
3160   auto Minus1 = ConstantInt::get(Ty->getScalarType(), -1);
3161 if (auto *VecTy = dyn_cast<VectorType>(Ty))
3162 return ConstantVector::getSplat(VecTy->getElementCount(), Minus1);
3163 return Minus1;
3164}
3165
3166auto HexagonVectorCombine::getConstSplat(Type *Ty, int Val) const
3167 -> Constant * {
3168 assert(Ty->isVectorTy());
3169 auto VecTy = cast<VectorType>(Ty);
3170 Type *ElemTy = VecTy->getElementType();
3171 // Add support for floats if needed.
3172 auto *Splat = ConstantVector::getSplat(VecTy->getElementCount(),
3173 ConstantInt::get(ElemTy, Val));
3174 return Splat;
3175}
3176
3177auto HexagonVectorCombine::simplify(Value *V) const -> Value * {
3178 if (auto *In = dyn_cast<Instruction>(V)) {
3179 SimplifyQuery Q(DL, &TLI, &DT, &AC, In);
3180 return simplifyInstruction(In, Q);
3181 }
3182 return nullptr;
3183}
3184
3185// Insert bytes [Start..Start+Length) of Src into Dst at byte Where.
3186auto HexagonVectorCombine::insertb(IRBuilderBase &Builder, Value *Dst,
3187 Value *Src, int Start, int Length,
3188 int Where) const -> Value * {
3189 assert(isByteVecTy(Dst->getType()) && isByteVecTy(Src->getType()));
3190 int SrcLen = getSizeOf(Src);
3191 int DstLen = getSizeOf(Dst);
3192 assert(0 <= Start && Start + Length <= SrcLen);
3193 assert(0 <= Where && Where + Length <= DstLen);
3194
3195 int P2Len = PowerOf2Ceil(SrcLen | DstLen);
3196 auto *Poison = PoisonValue::get(getByteTy());
3197 Value *P2Src = vresize(Builder, Src, P2Len, Poison);
3198 Value *P2Dst = vresize(Builder, Dst, P2Len, Poison);
3199
3200 SmallVector<int, 256> SMask(P2Len);
3201 for (int i = 0; i != P2Len; ++i) {
3202 // If i is in [Where, Where+Length), pick Src[Start+(i-Where)].
3203     // Otherwise, pick Dst[i].
3204 SMask[i] =
3205 (Where <= i && i < Where + Length) ? P2Len + Start + (i - Where) : i;
3206 }
3207
3208 Value *P2Insert = Builder.CreateShuffleVector(P2Dst, P2Src, SMask, "shf");
3209 return vresize(Builder, P2Insert, DstLen, Poison);
3210}
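// Editorial worked example, not part of the original source: with the
// hypothetical values P2Len = 8, Start = 0, Length = 2, Where = 3 the mask
// built above is {0, 1, 2, 8, 9, 5, 6, 7}. Indices below P2Len keep Dst
// bytes; indices 8 and 9 pull Src bytes 0 and 1, since the second shuffle
// operand's lanes are numbered starting at P2Len.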
3211
3212auto HexagonVectorCombine::vlalignb(IRBuilderBase &Builder, Value *Lo,
3213 Value *Hi, Value *Amt) const -> Value * {
3214 assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
3215 if (isZero(Amt))
3216 return Hi;
3217 int VecLen = getSizeOf(Hi);
3218 if (auto IntAmt = getIntValue(Amt))
3219 return getElementRange(Builder, Lo, Hi, VecLen - IntAmt->getSExtValue(),
3220 VecLen);
3221
3222 if (HST.isTypeForHVX(Hi->getType())) {
3223 assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
3224 "Expecting an exact HVX type");
3225 return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_vlalignb),
3226 Hi->getType(), {Hi, Lo, Amt});
3227 }
3228
3229 if (VecLen == 4) {
3230 Value *Pair = concat(Builder, {Lo, Hi});
3231 Value *Shift =
3232 Builder.CreateLShr(Builder.CreateShl(Pair, Amt, "shl"), 32, "lsr");
3233 Value *Trunc =
3234 Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");
3235 return Builder.CreateBitCast(Trunc, Hi->getType(), "cst");
3236 }
3237 if (VecLen == 8) {
3238 Value *Sub = Builder.CreateSub(getConstInt(VecLen), Amt, "sub");
3239 return vralignb(Builder, Lo, Hi, Sub);
3240 }
3241 llvm_unreachable("Unexpected vector length");
3242}
3243
3244auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo,
3245 Value *Hi, Value *Amt) const -> Value * {
3246 assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
3247 if (isZero(Amt))
3248 return Lo;
3249 int VecLen = getSizeOf(Lo);
3250 if (auto IntAmt = getIntValue(Amt))
3251 return getElementRange(Builder, Lo, Hi, IntAmt->getSExtValue(), VecLen);
3252
3253 if (HST.isTypeForHVX(Lo->getType())) {
3254 assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
3255 "Expecting an exact HVX type");
3256 return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_valignb),
3257 Lo->getType(), {Hi, Lo, Amt});
3258 }
3259
3260 if (VecLen == 4) {
3261 Value *Pair = concat(Builder, {Lo, Hi});
3262 Value *Shift = Builder.CreateLShr(Pair, Amt, "lsr");
3263 Value *Trunc =
3264 Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");
3265 return Builder.CreateBitCast(Trunc, Lo->getType(), "cst");
3266 }
3267 if (VecLen == 8) {
3268 Type *Int64Ty = Type::getInt64Ty(F.getContext());
3269 Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty, "cst");
3270 Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty, "cst");
3271 Value *Call = Builder.CreateIntrinsic(Intrinsic::hexagon_S2_valignrb,
3272 {Hi64, Lo64, Amt},
3273 /*FMFSource=*/nullptr, "cup");
3274 return Builder.CreateBitCast(Call, Lo->getType(), "cst");
3275 }
3276 llvm_unreachable("Unexpected vector length");
3277}
3278
3279// Concatenates a sequence of vectors of the same type.
3280auto HexagonVectorCombine::concat(IRBuilderBase &Builder,
3281 ArrayRef<Value *> Vecs) const -> Value * {
3282 assert(!Vecs.empty());
3283   SmallVector<int, 256> SMask;
3284   std::vector<Value *> Work[2];
3285 int ThisW = 0, OtherW = 1;
3286
3287 Work[ThisW].assign(Vecs.begin(), Vecs.end());
3288 while (Work[ThisW].size() > 1) {
3289 auto *Ty = cast<VectorType>(Work[ThisW].front()->getType());
3290 SMask.resize(length(Ty) * 2);
3291 std::iota(SMask.begin(), SMask.end(), 0);
3292
3293 Work[OtherW].clear();
3294 if (Work[ThisW].size() % 2 != 0)
3295 Work[ThisW].push_back(UndefValue::get(Ty));
3296 for (int i = 0, e = Work[ThisW].size(); i < e; i += 2) {
3297 Value *Joined = Builder.CreateShuffleVector(
3298 Work[ThisW][i], Work[ThisW][i + 1], SMask, "shf");
3299 Work[OtherW].push_back(Joined);
3300 }
3301 std::swap(ThisW, OtherW);
3302 }
3303
3304 // Since there may have been some undefs appended to make shuffle operands
3305 // have the same type, perform the last shuffle to only pick the original
3306 // elements.
3307 SMask.resize(Vecs.size() * length(Vecs.front()->getType()));
3308 std::iota(SMask.begin(), SMask.end(), 0);
3309 Value *Total = Work[ThisW].front();
3310 return Builder.CreateShuffleVector(Total, SMask, "shf");
3311}
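// Editorial worked example, not part of the original source: concatenating
// three vectors pads the worklist to four with an undef, joins (V0,V1) and
// (V2,undef) into double-length vectors, joins those two into one
// quadruple-length vector, and the final shuffle then keeps only the first
// 3 * length(V0) elements, i.e. the original data.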
3312
3313auto HexagonVectorCombine::vresize(IRBuilderBase &Builder, Value *Val,
3314 int NewSize, Value *Pad) const -> Value * {
3315   assert(isa<VectorType>(Val->getType()));
3316   auto *ValTy = cast<VectorType>(Val->getType());
3317 assert(ValTy->getElementType() == Pad->getType());
3318
3319 int CurSize = length(ValTy);
3320 if (CurSize == NewSize)
3321 return Val;
3322 // Truncate?
3323 if (CurSize > NewSize)
3324 return getElementRange(Builder, Val, /*Ignored*/ Val, 0, NewSize);
3325 // Extend.
3326 SmallVector<int, 128> SMask(NewSize);
3327 std::iota(SMask.begin(), SMask.begin() + CurSize, 0);
3328 std::fill(SMask.begin() + CurSize, SMask.end(), CurSize);
3329 Value *PadVec = Builder.CreateVectorSplat(CurSize, Pad, "spt");
3330 return Builder.CreateShuffleVector(Val, PadVec, SMask, "shf");
3331}
3332
3333auto HexagonVectorCombine::rescale(IRBuilderBase &Builder, Value *Mask,
3334 Type *FromTy, Type *ToTy) const -> Value * {
3335 // Mask is a vector <N x i1>, where each element corresponds to an
3336 // element of FromTy. Remap it so that each element will correspond
3337 // to an element of ToTy.
3338 assert(isa<VectorType>(Mask->getType()));
3339
3340 Type *FromSTy = FromTy->getScalarType();
3341 Type *ToSTy = ToTy->getScalarType();
3342 if (FromSTy == ToSTy)
3343 return Mask;
3344
3345 int FromSize = getSizeOf(FromSTy);
3346 int ToSize = getSizeOf(ToSTy);
3347 assert(FromSize % ToSize == 0 || ToSize % FromSize == 0);
3348
3349 auto *MaskTy = cast<VectorType>(Mask->getType());
3350 int FromCount = length(MaskTy);
3351 int ToCount = (FromCount * FromSize) / ToSize;
3352 assert((FromCount * FromSize) % ToSize == 0);
3353
3354 auto *FromITy = getIntTy(FromSize * 8);
3355 auto *ToITy = getIntTy(ToSize * 8);
3356
3357 // Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> ->
3358 // -> trunc to <M x i1>.
3359 Value *Ext = Builder.CreateSExt(
3360 Mask, VectorType::get(FromITy, FromCount, /*Scalable=*/false), "sxt");
3361 Value *Cast = Builder.CreateBitCast(
3362 Ext, VectorType::get(ToITy, ToCount, /*Scalable=*/false), "cst");
3363 return Builder.CreateTrunc(
3364 Cast, VectorType::get(getBoolTy(), ToCount, /*Scalable=*/false), "trn");
3365}
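// Editorial sketch, not part of the original source: a scalar model of the
// sext/bitcast/trunc mask rescaling above, assuming little-endian lane layout
// (as on Hexagon). Going to narrower elements replicates each mask bit; going
// to wider elements keeps the bit of the lowest-addressed source element in
// each group, which is what truncating the bitcast result to i1 selects.
// Helper invented for illustration.
#include <vector>
static std::vector<bool> scalarRescaleMask(const std::vector<bool> &Mask,
                                           int FromSize, int ToSize) {
  int ToCount = static_cast<int>(Mask.size()) * FromSize / ToSize;
  std::vector<bool> Out(ToCount);
  for (int i = 0; i != ToCount; ++i)
    Out[i] = Mask[i * ToSize / FromSize];
  return Out;
}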
3366
3367// Bitcast to bytes, and return least significant bits.
3368auto HexagonVectorCombine::vlsb(IRBuilderBase &Builder, Value *Val) const
3369 -> Value * {
3370 Type *ScalarTy = Val->getType()->getScalarType();
3371 if (ScalarTy == getBoolTy())
3372 return Val;
3373
3374 Value *Bytes = vbytes(Builder, Val);
3375 if (auto *VecTy = dyn_cast<VectorType>(Bytes->getType()))
3376 return Builder.CreateTrunc(Bytes, getBoolTy(getSizeOf(VecTy)), "trn");
3377 // If Bytes is a scalar (i.e. Val was a scalar byte), return i1, not
3378 // <1 x i1>.
3379 return Builder.CreateTrunc(Bytes, getBoolTy(), "trn");
3380}
3381
3382// Bitcast to bytes for non-bool. For bool, convert i1 -> i8.
3383auto HexagonVectorCombine::vbytes(IRBuilderBase &Builder, Value *Val) const
3384 -> Value * {
3385 Type *ScalarTy = Val->getType()->getScalarType();
3386 if (ScalarTy == getByteTy())
3387 return Val;
3388
3389 if (ScalarTy != getBoolTy())
3390 return Builder.CreateBitCast(Val, getByteTy(getSizeOf(Val)), "cst");
3391 // For bool, return a sext from i1 to i8.
3392 if (auto *VecTy = dyn_cast<VectorType>(Val->getType()))
3393 return Builder.CreateSExt(Val, VectorType::get(getByteTy(), VecTy), "sxt");
3394 return Builder.CreateSExt(Val, getByteTy(), "sxt");
3395}
3396
3397auto HexagonVectorCombine::subvector(IRBuilderBase &Builder, Value *Val,
3398 unsigned Start, unsigned Length) const
3399 -> Value * {
3400 assert(Start + Length <= length(Val));
3401 return getElementRange(Builder, Val, /*Ignored*/ Val, Start, Length);
3402}
3403
3404auto HexagonVectorCombine::sublo(IRBuilderBase &Builder, Value *Val) const
3405 -> Value * {
3406 size_t Len = length(Val);
3407 assert(Len % 2 == 0 && "Length should be even");
3408 return subvector(Builder, Val, 0, Len / 2);
3409}
3410
3411auto HexagonVectorCombine::subhi(IRBuilderBase &Builder, Value *Val) const
3412 -> Value * {
3413 size_t Len = length(Val);
3414 assert(Len % 2 == 0 && "Length should be even");
3415 return subvector(Builder, Val, Len / 2, Len / 2);
3416}
3417
3418auto HexagonVectorCombine::vdeal(IRBuilderBase &Builder, Value *Val0,
3419 Value *Val1) const -> Value * {
3420 assert(Val0->getType() == Val1->getType());
3421 int Len = length(Val0);
3422 SmallVector<int, 128> Mask(2 * Len);
3423
3424 for (int i = 0; i != Len; ++i) {
3425 Mask[i] = 2 * i; // Even
3426 Mask[i + Len] = 2 * i + 1; // Odd
3427 }
3428 return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");
3429}
3430
3431auto HexagonVectorCombine::vshuff(IRBuilderBase &Builder, Value *Val0,
3432 Value *Val1) const -> Value * { //
3433 assert(Val0->getType() == Val1->getType());
3434 int Len = length(Val0);
3435 SmallVector<int, 128> Mask(2 * Len);
3436
3437 for (int i = 0; i != Len; ++i) {
3438 Mask[2 * i + 0] = i; // Val0
3439 Mask[2 * i + 1] = i + Len; // Val1
3440 }
3441 return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");
3442}
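// Editorial worked example, not part of the original source: for a
// hypothetical Len = 4 the masks built above are
//   vdeal : {0, 2, 4, 6, 1, 3, 5, 7}  // even lanes of the pair, then odd lanes
//   vshuff: {0, 4, 1, 5, 2, 6, 3, 7}  // interleave Val0 and Val1 lane by lane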
3443
3444auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,
3445 Intrinsic::ID IntID, Type *RetTy,
3446 ArrayRef<Value *> Args,
3447 ArrayRef<Type *> ArgTys,
3448 ArrayRef<Value *> MDSources) const
3449 -> Value * {
3450 auto getCast = [&](IRBuilderBase &Builder, Value *Val,
3451 Type *DestTy) -> Value * {
3452 Type *SrcTy = Val->getType();
3453 if (SrcTy == DestTy)
3454 return Val;
3455
3456 // Non-HVX type. It should be a scalar, and it should already have
3457 // a valid type.
3458 assert(HST.isTypeForHVX(SrcTy, /*IncludeBool=*/true));
3459
3460 Type *BoolTy = Type::getInt1Ty(F.getContext());
3461 if (cast<VectorType>(SrcTy)->getElementType() != BoolTy)
3462 return Builder.CreateBitCast(Val, DestTy, "cst");
3463
3464 // Predicate HVX vector.
3465 unsigned HwLen = HST.getVectorLength();
3466 Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast
3467 : Intrinsic::hexagon_V6_pred_typecast_128B;
3468 return Builder.CreateIntrinsic(TC, {DestTy, Val->getType()}, {Val},
3469 /*FMFSource=*/nullptr, "cup");
3470 };
3471
3472 Function *IntrFn =
3473 Intrinsic::getOrInsertDeclaration(F.getParent(), IntID, ArgTys);
3474 FunctionType *IntrTy = IntrFn->getFunctionType();
3475
3476 SmallVector<Value *, 4> IntrArgs;
3477 for (int i = 0, e = Args.size(); i != e; ++i) {
3478 Value *A = Args[i];
3479 Type *T = IntrTy->getParamType(i);
3480 if (A->getType() != T) {
3481 IntrArgs.push_back(getCast(Builder, A, T));
3482 } else {
3483 IntrArgs.push_back(A);
3484 }
3485 }
3486 StringRef MaybeName = !IntrTy->getReturnType()->isVoidTy() ? "cup" : "";
3487 CallInst *Call = Builder.CreateCall(IntrFn, IntrArgs, MaybeName);
3488
3489 MemoryEffects ME = Call->getAttributes().getMemoryEffects();
3490   if (!ME.doesNotAccessMemory() && !ME.onlyAccessesInaccessibleMem())
3491     propagateMetadata(Call, MDSources);
3492
3493 Type *CallTy = Call->getType();
3494 if (RetTy == nullptr || CallTy == RetTy)
3495 return Call;
3496 // Scalar types should have RetTy matching the call return type.
3497 assert(HST.isTypeForHVX(CallTy, /*IncludeBool=*/true));
3498 return getCast(Builder, Call, RetTy);
3499}
3500
3501auto HexagonVectorCombine::splitVectorElements(IRBuilderBase &Builder,
3502 Value *Vec,
3503 unsigned ToWidth) const
3504     -> SmallVector<Value *> {
3505   // Break a vector of wide elements into a series of vectors with narrow
3506 // elements:
3507 // (...c0:b0:a0, ...c1:b1:a1, ...c2:b2:a2, ...)
3508 // -->
3509 // (a0, a1, a2, ...) // lowest "ToWidth" bits
3510 // (b0, b1, b2, ...) // the next lowest...
3511 // (c0, c1, c2, ...) // ...
3512 // ...
3513 //
3514 // The number of elements in each resulting vector is the same as
3515 // in the original vector.
3516
3517 auto *VecTy = cast<VectorType>(Vec->getType());
3518 assert(VecTy->getElementType()->isIntegerTy());
3519 unsigned FromWidth = VecTy->getScalarSizeInBits();
3520 assert(isPowerOf2_32(ToWidth) && isPowerOf2_32(FromWidth));
3521 assert(ToWidth <= FromWidth && "Breaking up into wider elements?");
3522 unsigned NumResults = FromWidth / ToWidth;
3523
3524 SmallVector<Value *> Results(NumResults);
3525 Results[0] = Vec;
3526 unsigned Length = length(VecTy);
3527
3528 // Do it by splitting in half, since those operations correspond to deal
3529 // instructions.
3530 auto splitInHalf = [&](unsigned Begin, unsigned End, auto splitFunc) -> void {
3531     // Take V = Results[Begin] and split it into L and H.
3532     // Store Results[Begin] = L and Results[Half] = H, where Half = (Begin+End)/2.
3533     // Then recurse: split(Begin, Half) and split(Half, End).
3534 if (Begin + 1 == End)
3535 return;
3536
3537 Value *Val = Results[Begin];
3538 unsigned Width = Val->getType()->getScalarSizeInBits();
3539
3540 auto *VTy = VectorType::get(getIntTy(Width / 2), 2 * Length, false);
3541 Value *VVal = Builder.CreateBitCast(Val, VTy, "cst");
3542
3543 Value *Res = vdeal(Builder, sublo(Builder, VVal), subhi(Builder, VVal));
3544
3545 unsigned Half = (Begin + End) / 2;
3546 Results[Begin] = sublo(Builder, Res);
3547 Results[Half] = subhi(Builder, Res);
3548
3549 splitFunc(Begin, Half, splitFunc);
3550 splitFunc(Half, End, splitFunc);
3551 };
3552
3553 splitInHalf(0, NumResults, splitInHalf);
3554 return Results;
3555}
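// Editorial worked example, not part of the original source: splitting a
// hypothetical <64 x i32> vector with ToWidth = 8 yields four <64 x i8>
// results, where Results[0] holds bits [7:0] of every element, Results[1]
// bits [15:8], Results[2] bits [23:16] and Results[3] bits [31:24]; the
// element count of each result matches the original, as stated above.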
3556
3557auto HexagonVectorCombine::joinVectorElements(IRBuilderBase &Builder,
3558 ArrayRef<Value *> Values,
3559 VectorType *ToType) const
3560 -> Value * {
3561 assert(ToType->getElementType()->isIntegerTy());
3562
3563 // If the list of values does not have power-of-2 elements, append copies
3564 // of the sign bit to it, to make the size be 2^n.
3565 // The reason for this is that the values will be joined in pairs, because
3566 // otherwise the shuffles will result in convoluted code. With pairwise
3567 // joins, the shuffles will hopefully be folded into a perfect shuffle.
3568 // The output will need to be sign-extended to a type with element width
3569   // being a power of 2 anyway.
3570 SmallVector<Value *> Inputs(Values);
3571
3572 unsigned ToWidth = ToType->getScalarSizeInBits();
3573 unsigned Width = Inputs.front()->getType()->getScalarSizeInBits();
3574 assert(Width <= ToWidth);
3575 assert(isPowerOf2_32(Width) && isPowerOf2_32(ToWidth));
3576 unsigned Length = length(Inputs.front()->getType());
3577
3578 unsigned NeedInputs = ToWidth / Width;
3579 if (Inputs.size() != NeedInputs) {
3580 // Having too many inputs is ok: drop the high bits (usual wrap-around).
3581 // If there are too few, fill them with the sign bit.
3582 Value *Last = Inputs.back();
3583 Value *Sign = Builder.CreateAShr(
3584 Last, getConstSplat(Last->getType(), Width - 1), "asr");
3585 Inputs.resize(NeedInputs, Sign);
3586 }
3587
3588 while (Inputs.size() > 1) {
3589 Width *= 2;
3590 auto *VTy = VectorType::get(getIntTy(Width), Length, false);
3591 for (int i = 0, e = Inputs.size(); i < e; i += 2) {
3592 Value *Res = vshuff(Builder, Inputs[i], Inputs[i + 1]);
3593 Inputs[i / 2] = Builder.CreateBitCast(Res, VTy, "cst");
3594 }
3595 Inputs.resize(Inputs.size() / 2);
3596 }
3597
3598 assert(Inputs.front()->getType() == ToType);
3599 return Inputs.front();
3600}
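// Editorial worked example, not part of the original source: joining three
// hypothetical <64 x i16> inputs into <64 x i64> first extends the list to
// NeedInputs = 64 / 16 = 4 entries by appending a copy of the last input's
// per-element sign bits (the AShr by Width - 1 above), then vshuff-combines
// pairs: i16 -> i32 in the first round and i32 -> i64 in the second, halving
// the number of inputs until one vector of the requested type remains.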
3601
3602auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0,
3603 Value *Ptr1) const
3604 -> std::optional<int> {
3605 // Try SCEV first.
3606 const SCEV *Scev0 = SE.getSCEV(Ptr0);
3607 const SCEV *Scev1 = SE.getSCEV(Ptr1);
3608 const SCEV *ScevDiff = SE.getMinusSCEV(Scev0, Scev1);
3609 if (auto *Const = dyn_cast<SCEVConstant>(ScevDiff)) {
3610 APInt V = Const->getAPInt();
3611 if (V.isSignedIntN(8 * sizeof(int)))
3612 return static_cast<int>(V.getSExtValue());
3613 }
3614
3615 struct Builder : IRBuilder<> {
3616 Builder(BasicBlock *B) : IRBuilder<>(B->getTerminator()) {}
3617 ~Builder() {
3618 for (Instruction *I : llvm::reverse(ToErase))
3619 I->eraseFromParent();
3620 }
3621 SmallVector<Instruction *, 8> ToErase;
3622 };
3623
3624#define CallBuilder(B, F) \
3625 [&](auto &B_) { \
3626 Value *V = B_.F; \
3627 if (auto *I = dyn_cast<Instruction>(V)) \
3628 B_.ToErase.push_back(I); \
3629 return V; \
3630 }(B)
3631
3632 auto Simplify = [this](Value *V) {
3633 if (Value *S = simplify(V))
3634 return S;
3635 return V;
3636 };
3637
3638 auto StripBitCast = [](Value *V) {
3639 while (auto *C = dyn_cast<BitCastInst>(V))
3640 V = C->getOperand(0);
3641 return V;
3642 };
3643
3644 Ptr0 = StripBitCast(Ptr0);
3645 Ptr1 = StripBitCast(Ptr1);
3646   if (!isa<GetElementPtrInst>(Ptr0) || !isa<GetElementPtrInst>(Ptr1))
3647     return std::nullopt;
3648
3649 auto *Gep0 = cast<GetElementPtrInst>(Ptr0);
3650 auto *Gep1 = cast<GetElementPtrInst>(Ptr1);
3651 if (Gep0->getPointerOperand() != Gep1->getPointerOperand())
3652 return std::nullopt;
3653 if (Gep0->getSourceElementType() != Gep1->getSourceElementType())
3654 return std::nullopt;
3655
3656 Builder B(Gep0->getParent());
3657 int Scale = getSizeOf(Gep0->getSourceElementType(), Alloc);
3658
3659 // FIXME: for now only check GEPs with a single index.
3660 if (Gep0->getNumOperands() != 2 || Gep1->getNumOperands() != 2)
3661 return std::nullopt;
3662
3663 Value *Idx0 = Gep0->getOperand(1);
3664 Value *Idx1 = Gep1->getOperand(1);
3665
3666 // First, try to simplify the subtraction directly.
3667 if (auto *Diff = dyn_cast<ConstantInt>(
3668 Simplify(CallBuilder(B, CreateSub(Idx0, Idx1)))))
3669 return Diff->getSExtValue() * Scale;
3670
3671 KnownBits Known0 = getKnownBits(Idx0, Gep0);
3672 KnownBits Known1 = getKnownBits(Idx1, Gep1);
3673 APInt Unknown = ~(Known0.Zero | Known0.One) | ~(Known1.Zero | Known1.One);
3674 if (Unknown.isAllOnes())
3675 return std::nullopt;
3676
3677 Value *MaskU = ConstantInt::get(Idx0->getType(), Unknown);
3678 Value *AndU0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskU)));
3679 Value *AndU1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskU)));
3680 Value *SubU = Simplify(CallBuilder(B, CreateSub(AndU0, AndU1)));
3681 int Diff0 = 0;
3682 if (auto *C = dyn_cast<ConstantInt>(SubU)) {
3683 Diff0 = C->getSExtValue();
3684 } else {
3685 return std::nullopt;
3686 }
3687
3688 Value *MaskK = ConstantInt::get(MaskU->getType(), ~Unknown);
3689 Value *AndK0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskK)));
3690 Value *AndK1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskK)));
3691 Value *SubK = Simplify(CallBuilder(B, CreateSub(AndK0, AndK1)));
3692 int Diff1 = 0;
3693 if (auto *C = dyn_cast<ConstantInt>(SubK)) {
3694 Diff1 = C->getSExtValue();
3695 } else {
3696 return std::nullopt;
3697 }
3698
3699 return (Diff0 + Diff1) * Scale;
3700
3701#undef CallBuilder
3702}
3703
3704auto HexagonVectorCombine::getNumSignificantBits(const Value *V,
3705 const Instruction *CtxI) const
3706 -> unsigned {
3707 return ComputeMaxSignificantBits(V, DL, &AC, CtxI, &DT);
3708}
3709
3710auto HexagonVectorCombine::getKnownBits(const Value *V,
3711 const Instruction *CtxI) const
3712 -> KnownBits {
3713 return computeKnownBits(V, DL, &AC, CtxI, &DT);
3714}
3715
3716auto HexagonVectorCombine::isSafeToClone(const Instruction &In) const -> bool {
3717 if (In.mayHaveSideEffects() || In.isAtomic() || In.isVolatile() ||
3718 In.isFenceLike() || In.mayReadOrWriteMemory()) {
3719 return false;
3720 }
3721 if (isa<CallBase>(In) || isa<AllocaInst>(In))
3722 return false;
3723 return true;
3724}
3725
3726template <typename T>
3727auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
3728                                                   BasicBlock::const_iterator To,
3729                                                   const T &IgnoreInsts) const
3730 -> bool {
3731 auto getLocOrNone =
3732 [this](const Instruction &I) -> std::optional<MemoryLocation> {
3733 if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
3734 switch (II->getIntrinsicID()) {
3735 case Intrinsic::masked_load:
3736 return MemoryLocation::getForArgument(II, 0, TLI);
3737 case Intrinsic::masked_store:
3738 return MemoryLocation::getForArgument(II, 1, TLI);
3739 }
3740 }
3741     return MemoryLocation::getOrNone(&I);
3742   };
3743
3744 // The source and the destination must be in the same basic block.
3745 const BasicBlock &Block = *In.getParent();
3746 assert(Block.begin() == To || Block.end() == To || To->getParent() == &Block);
3747 // No PHIs.
3748 if (isa<PHINode>(In) || (To != Block.end() && isa<PHINode>(*To)))
3749 return false;
3750
3751   if (!mayHaveNonDefUseDependency(In))
3752     return true;
3753 bool MayWrite = In.mayWriteToMemory();
3754 auto MaybeLoc = getLocOrNone(In);
3755
3756 auto From = In.getIterator();
3757 if (From == To)
3758 return true;
3759 bool MoveUp = (To != Block.end() && To->comesBefore(&In));
3760 auto Range =
3761 MoveUp ? std::make_pair(To, From) : std::make_pair(std::next(From), To);
3762 for (auto It = Range.first; It != Range.second; ++It) {
3763 const Instruction &I = *It;
3764 if (llvm::is_contained(IgnoreInsts, &I))
3765 continue;
3766       // The llvm.assume intrinsic can be ignored.
3767 if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
3768 if (II->getIntrinsicID() == Intrinsic::assume)
3769 continue;
3770 }
3771     // Parts based on isSafeToMoveBefore from CodeMoverUtils.cpp.
3772 if (I.mayThrow())
3773 return false;
3774 if (auto *CB = dyn_cast<CallBase>(&I)) {
3775 if (!CB->hasFnAttr(Attribute::WillReturn))
3776 return false;
3777 if (!CB->hasFnAttr(Attribute::NoSync))
3778 return false;
3779 }
3780 if (I.mayReadOrWriteMemory()) {
3781 auto MaybeLocI = getLocOrNone(I);
3782 if (MayWrite || I.mayWriteToMemory()) {
3783 if (!MaybeLoc || !MaybeLocI)
3784 return false;
3785 if (!AA.isNoAlias(*MaybeLoc, *MaybeLocI))
3786 return false;
3787 }
3788 }
3789 }
3790 return true;
3791}
3792
3793auto HexagonVectorCombine::isByteVecTy(Type *Ty) const -> bool {
3794 if (auto *VecTy = dyn_cast<VectorType>(Ty))
3795 return VecTy->getElementType() == getByteTy();
3796 return false;
3797}
3798
3799auto HexagonVectorCombine::getElementRange(IRBuilderBase &Builder, Value *Lo,
3800 Value *Hi, int Start,
3801 int Length) const -> Value * {
3802 assert(0 <= Start && size_t(Start + Length) < length(Lo) + length(Hi));
3803 SmallVector<int, 128> SMask(Length);
3804 std::iota(SMask.begin(), SMask.end(), Start);
3805 return Builder.CreateShuffleVector(Lo, Hi, SMask, "shf");
3806}
3807
3808// Pass management.
3809
3810namespace {
3811class HexagonVectorCombineLegacy : public FunctionPass {
3812public:
3813 static char ID;
3814
3815 HexagonVectorCombineLegacy() : FunctionPass(ID) {}
3816
3817 StringRef getPassName() const override { return "Hexagon Vector Combine"; }
3818
3819 void getAnalysisUsage(AnalysisUsage &AU) const override {
3820 AU.setPreservesCFG();
3821 AU.addRequired<AAResultsWrapperPass>();
3822 AU.addRequired<AssumptionCacheTracker>();
3823 AU.addRequired<DominatorTreeWrapperPass>();
3824 AU.addRequired<ScalarEvolutionWrapperPass>();
3825 AU.addRequired<TargetLibraryInfoWrapperPass>();
3826 AU.addRequired<TargetPassConfig>();
3827 FunctionPass::getAnalysisUsage(AU);
3828 }
3829
3830 bool runOnFunction(Function &F) override {
3831 if (skipFunction(F))
3832 return false;
3833 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
3834 AssumptionCache &AC =
3835 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
3836 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
3837 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
3838 TargetLibraryInfo &TLI =
3839 getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
3840 auto &TM = getAnalysis<TargetPassConfig>().getTM<HexagonTargetMachine>();
3841 HexagonVectorCombine HVC(F, AA, AC, DT, SE, TLI, TM);
3842 return HVC.run();
3843 }
3844};
3845} // namespace
3846
3847char HexagonVectorCombineLegacy::ID = 0;
3848
3849INITIALIZE_PASS_BEGIN(HexagonVectorCombineLegacy, DEBUG_TYPE,
3850 "Hexagon Vector Combine", false, false)
3851 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
3852 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
3853 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
3854 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
3855 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
3856 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
3857 INITIALIZE_PASS_END(HexagonVectorCombineLegacy, DEBUG_TYPE,
3858 "Hexagon Vector Combine", false, false)
3859
3860 FunctionPass *llvm::createHexagonVectorCombineLegacyPass() {
3861   return new HexagonVectorCombineLegacy();
3862}