1//===-- HexagonVectorCombine.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8// HexagonVectorCombine is a utility class implementing a variety of functions
9// that assist in vector-based optimizations.
10//
11// AlignVectors: replace unaligned vector loads and stores with aligned ones.
12// HvxIdioms: recognize various opportunities to generate HVX intrinsic code.
13//===----------------------------------------------------------------------===//
14
15#include "llvm/ADT/APInt.h"
16#include "llvm/ADT/ArrayRef.h"
17#include "llvm/ADT/DenseMap.h"
18#include "llvm/ADT/STLExtras.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/IRBuilder.h"
33#include "llvm/IR/Intrinsics.h"
34#include "llvm/IR/IntrinsicsHexagon.h"
35#include "llvm/IR/Metadata.h"
38#include "llvm/Pass.h"
45
46#include "Hexagon.h"
47#include "HexagonSubtarget.h"
49
50#include <algorithm>
51#include <deque>
52#include <map>
53#include <optional>
54#include <set>
55#include <utility>
56#include <vector>
57
58#define DEBUG_TYPE "hexagon-vc"
59
60// This constant represents the default HVX VTCM page size.
61// It is boot-time configurable, so we probably want an API to
62// read it, but for now assume 128KB.
63#define DEFAULT_HVX_VTCM_PAGE_SIZE 131072
64
65using namespace llvm;
66
67namespace {
68cl::opt<bool> DumpModule("hvc-dump-module", cl::Hidden);
69cl::opt<bool> VAEnabled("hvc-va", cl::Hidden, cl::init(true)); // Align
70cl::opt<bool> VIEnabled("hvc-vi", cl::Hidden, cl::init(true)); // Idioms
71cl::opt<bool> VADoFullStores("hvc-va-full-stores", cl::Hidden);
72
73cl::opt<unsigned> VAGroupCountLimit("hvc-va-group-count-limit", cl::Hidden,
74 cl::init(~0));
75cl::opt<unsigned> VAGroupSizeLimit("hvc-va-group-size-limit", cl::Hidden,
76 cl::init(~0));
77
78class HexagonVectorCombine {
79public:
80 HexagonVectorCombine(Function &F_, AliasAnalysis &AA_, AssumptionCache &AC_,
81 DominatorTree &DT_, ScalarEvolution &SE_,
82 TargetLibraryInfo &TLI_, const TargetMachine &TM_)
83 : F(F_), DL(F.getDataLayout()), AA(AA_), AC(AC_), DT(DT_),
84 SE(SE_), TLI(TLI_),
85 HST(static_cast<const HexagonSubtarget &>(*TM_.getSubtargetImpl(F))) {}
86
87 bool run();
88
89 // Common integer type.
90 IntegerType *getIntTy(unsigned Width = 32) const;
91 // Byte type: either scalar (when ElemCount = 0), or vector with the given
92 // element count.
93 Type *getByteTy(int ElemCount = 0) const;
94 // Boolean type: either scalar (when ElemCount = 0), or vector with the given
95 // element count.
96 Type *getBoolTy(int ElemCount = 0) const;
97 // Create a ConstantInt of type returned by getIntTy with the value Val.
98 ConstantInt *getConstInt(int Val, unsigned Width = 32) const;
99 // Get the integer value of V, if it exists.
100 std::optional<APInt> getIntValue(const Value *Val) const;
101 // Is Val a constant 0, or a vector of 0s?
102 bool isZero(const Value *Val) const;
103 // Is Val an undef value?
104 bool isUndef(const Value *Val) const;
105 // Is Val a scalar (i1 true) or a vector of (i1 true)?
106 bool isTrue(const Value *Val) const;
107 // Is Val a scalar (i1 false) or a vector of (i1 false)?
108 bool isFalse(const Value *Val) const;
109
110 // Get HVX vector type with the given element type.
111 VectorType *getHvxTy(Type *ElemTy, bool Pair = false) const;
112
113 enum SizeKind {
114 Store, // Store size
115 Alloc, // Alloc size
116 };
117 int getSizeOf(const Value *Val, SizeKind Kind = Store) const;
118 int getSizeOf(const Type *Ty, SizeKind Kind = Store) const;
119 int getTypeAlignment(Type *Ty) const;
120 size_t length(Value *Val) const;
121 size_t length(Type *Ty) const;
122
123 Constant *getNullValue(Type *Ty) const;
124 Constant *getFullValue(Type *Ty) const;
125 Constant *getConstSplat(Type *Ty, int Val) const;
126
127 Value *simplify(Value *Val) const;
128
129 Value *insertb(IRBuilderBase &Builder, Value *Dest, Value *Src, int Start,
130 int Length, int Where) const;
131 Value *vlalignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
132 Value *Amt) const;
133 Value *vralignb(IRBuilderBase &Builder, Value *Lo, Value *Hi,
134 Value *Amt) const;
135 Value *concat(IRBuilderBase &Builder, ArrayRef<Value *> Vecs) const;
136 Value *vresize(IRBuilderBase &Builder, Value *Val, int NewSize,
137 Value *Pad) const;
138 Value *rescale(IRBuilderBase &Builder, Value *Mask, Type *FromTy,
139 Type *ToTy) const;
140 Value *vlsb(IRBuilderBase &Builder, Value *Val) const;
141 Value *vbytes(IRBuilderBase &Builder, Value *Val) const;
142 Value *subvector(IRBuilderBase &Builder, Value *Val, unsigned Start,
143 unsigned Length) const;
144 Value *sublo(IRBuilderBase &Builder, Value *Val) const;
145 Value *subhi(IRBuilderBase &Builder, Value *Val) const;
146 Value *vdeal(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
147 Value *vshuff(IRBuilderBase &Builder, Value *Val0, Value *Val1) const;
148
149 Value *createHvxIntrinsic(IRBuilderBase &Builder, Intrinsic::ID IntID,
150 Type *RetTy, ArrayRef<Value *> Args,
151 ArrayRef<Type *> ArgTys = {},
152 ArrayRef<Value *> MDSources = {}) const;
153 SmallVector<Value *> splitVectorElements(IRBuilderBase &Builder, Value *Vec,
154 unsigned ToWidth) const;
155 Value *joinVectorElements(IRBuilderBase &Builder, ArrayRef<Value *> Values,
156 VectorType *ToType) const;
157
158 std::optional<int> calculatePointerDifference(Value *Ptr0, Value *Ptr1) const;
159
160 unsigned getNumSignificantBits(const Value *V,
161 const Instruction *CtxI = nullptr) const;
162 KnownBits getKnownBits(const Value *V,
163 const Instruction *CtxI = nullptr) const;
164
165 bool isSafeToClone(const Instruction &In) const;
166
167 template <typename T = std::vector<Instruction *>>
168 bool isSafeToMoveBeforeInBB(const Instruction &In,
169 BasicBlock::const_iterator To,
170 const T &IgnoreInsts = {}) const;
171
172 // This function is only used for assertions at the moment.
173 [[maybe_unused]] bool isByteVecTy(Type *Ty) const;
174
175 Function &F;
176 const DataLayout &DL;
177 AliasAnalysis &AA;
178 AssumptionCache &AC;
179 DominatorTree &DT;
180 ScalarEvolution &SE;
181 TargetLibraryInfo &TLI;
182 const HexagonSubtarget &HST;
183
184private:
185 Value *getElementRange(IRBuilderBase &Builder, Value *Lo, Value *Hi,
186 int Start, int Length) const;
187};
188
189class AlignVectors {
190 // This code tries to replace unaligned vector loads/stores with aligned
191 // ones.
192 // Consider unaligned load:
193 // %v = original_load %some_addr, align <bad>
194 // %user = %v
195 // It will generate
196 // = load ..., align <good>
197 // = load ..., align <good>
198 // = valign
199 // etc.
200 // %synthesize = combine/shuffle the loaded data so that it looks
201 // exactly like what "original_load" has loaded.
202 // %user = %synthesize
203 // Similarly for stores.
204public:
205 AlignVectors(const HexagonVectorCombine &HVC_) : HVC(HVC_) {}
206
207 bool run();
208
209private:
210 using InstList = std::vector<Instruction *>;
211 using InstMap = DenseMap<Instruction *, Instruction *>;
212
213 struct AddrInfo {
214 AddrInfo(const AddrInfo &) = default;
215 AddrInfo(const HexagonVectorCombine &HVC, Instruction *I, Value *A, Type *T,
216 Align H)
217 : Inst(I), Addr(A), ValTy(T), HaveAlign(H),
218 NeedAlign(HVC.getTypeAlignment(ValTy)) {}
219 AddrInfo &operator=(const AddrInfo &) = default;
220
221 // XXX: add Size member?
222 Instruction *Inst;
223 Value *Addr;
224 Type *ValTy;
225 Align HaveAlign;
226 Align NeedAlign;
227 int Offset = 0; // Offset (in bytes) from the first member of the
228 // containing AddrList.
229 };
230 using AddrList = std::vector<AddrInfo>;
231
232 struct InstrLess {
233 bool operator()(const Instruction *A, const Instruction *B) const {
234 return A->comesBefore(B);
235 }
236 };
237 using DepList = std::set<Instruction *, InstrLess>;
238
239 struct MoveGroup {
240 MoveGroup(const AddrInfo &AI, Instruction *B, bool Hvx, bool Load)
241 : Base(B), Main{AI.Inst}, Clones{}, IsHvx(Hvx), IsLoad(Load) {}
242 MoveGroup() = default;
243 Instruction *Base; // Base instruction of the parent address group.
244 InstList Main; // Main group of instructions.
245 InstList Deps; // List of dependencies.
246 InstMap Clones; // Map from original Deps to cloned ones.
247 bool IsHvx; // Is this a group of HVX instructions?
248 bool IsLoad; // Is this a load group?
249 };
250 using MoveList = std::vector<MoveGroup>;
251
252 struct ByteSpan {
253 // A representation of "interesting" bytes within a given span of memory.
254 // These bytes are those that are loaded or stored, and they don't have
255 // to cover the entire span of memory.
256 //
257 // The representation works by picking a contiguous sequence of bytes
258 // from somewhere within a llvm::Value, and placing it at a given offset
259 // within the span.
260 //
261 // The sequence of bytes from llvm::Value is represented by Segment.
262 // Block is Segment, plus where it goes in the span.
263 //
264 // An important feature of ByteSpan is being able to make a "section",
265 // i.e. creating another ByteSpan corresponding to a range of offsets
266 // relative to the source span.
267
268 struct Segment {
269 // Segment of a Value: 'Len' bytes starting at byte 'Begin'.
270 Segment(Value *Val, int Begin, int Len)
271 : Val(Val), Start(Begin), Size(Len) {}
272 Segment(const Segment &Seg) = default;
273 Segment &operator=(const Segment &Seg) = default;
274 Value *Val; // Value representable as a sequence of bytes.
275 int Start; // First byte of the value that belongs to the segment.
276 int Size; // Number of bytes in the segment.
277 };
278
279 struct Block {
280 Block(Value *Val, int Len, int Pos) : Seg(Val, 0, Len), Pos(Pos) {}
281 Block(Value *Val, int Off, int Len, int Pos)
282 : Seg(Val, Off, Len), Pos(Pos) {}
283 Block(const Block &Blk) = default;
284 Block &operator=(const Block &Blk) = default;
285 Segment Seg; // Value segment.
286 int Pos; // Position (offset) of the block in the span.
287 };
288
289 int extent() const;
290 ByteSpan section(int Start, int Length) const;
291 ByteSpan &shift(int Offset);
292 SmallVector<Value *, 8> values() const;
293
294 int size() const { return Blocks.size(); }
295 Block &operator[](int i) { return Blocks[i]; }
296 const Block &operator[](int i) const { return Blocks[i]; }
297
298 std::vector<Block> Blocks;
299
300 using iterator = decltype(Blocks)::iterator;
301 iterator begin() { return Blocks.begin(); }
302 iterator end() { return Blocks.end(); }
303 using const_iterator = decltype(Blocks)::const_iterator;
304 const_iterator begin() const { return Blocks.begin(); }
305 const_iterator end() const { return Blocks.end(); }
306 };
307
308 std::optional<AddrInfo> getAddrInfo(Instruction &In) const;
309 bool isHvx(const AddrInfo &AI) const;
310 // This function is only used for assertions at the moment.
311 [[maybe_unused]] bool isSectorTy(Type *Ty) const;
312
313 Value *getPayload(Value *Val) const;
314 Value *getMask(Value *Val) const;
315 Value *getPassThrough(Value *Val) const;
316
317 Value *createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
318 int Adjust,
319 const InstMap &CloneMap = InstMap()) const;
320 Value *createAlignedPointer(IRBuilderBase &Builder, Value *Ptr, Type *ValTy,
321 int Alignment,
322 const InstMap &CloneMap = InstMap()) const;
323
324 Value *createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
325 Value *Predicate, int Alignment, Value *Mask,
326 Value *PassThru, ArrayRef<Value *> MDSources = {}) const;
327 Value *createSimpleLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
328 int Alignment,
329 ArrayRef<Value *> MDSources = {}) const;
330
331 Value *createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
332 Value *Predicate, int Alignment, Value *Mask,
333 ArrayRef<Value *> MDSources = {}) const;
334 Value *createSimpleStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
335 int Alignment,
336 ArrayRef<Value *> MDSources = {}) const;
337
338 Value *createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
339 Value *Predicate, int Alignment,
340 ArrayRef<Value *> MDSources = {}) const;
341 Value *createPredicatedStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
342 Value *Predicate, int Alignment,
343 ArrayRef<Value *> MDSources = {}) const;
344
345 DepList getUpwardDeps(Instruction *In, Instruction *Base) const;
346 bool createAddressGroups();
347 MoveList createLoadGroups(const AddrList &Group) const;
348 MoveList createStoreGroups(const AddrList &Group) const;
349 bool moveTogether(MoveGroup &Move) const;
350 template <typename T>
351 InstMap cloneBefore(BasicBlock::iterator To, T &&Insts) const;
352
353 void realignLoadGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
354 int ScLen, Value *AlignVal, Value *AlignAddr) const;
355 void realignStoreGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
356 int ScLen, Value *AlignVal, Value *AlignAddr) const;
357 bool realignGroup(const MoveGroup &Move) const;
358
359 Value *makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
360 int Alignment) const;
361
362 friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
363 friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
364 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);
365 friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);
366
367 std::map<Instruction *, AddrList> AddrGroups;
368 const HexagonVectorCombine &HVC;
369};
370
371[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
372 const AlignVectors::AddrInfo &AI) {
373 OS << "Inst: " << AI.Inst << " " << *AI.Inst << '\n';
374 OS << "Addr: " << *AI.Addr << '\n';
375 OS << "Type: " << *AI.ValTy << '\n';
376 OS << "HaveAlign: " << AI.HaveAlign.value() << '\n';
377 OS << "NeedAlign: " << AI.NeedAlign.value() << '\n';
378 OS << "Offset: " << AI.Offset;
379 return OS;
380}
381
382[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
383 const AlignVectors::MoveGroup &MG) {
384 OS << "IsLoad:" << (MG.IsLoad ? "yes" : "no");
385 OS << ", IsHvx:" << (MG.IsHvx ? "yes" : "no") << '\n';
386 OS << "Main\n";
387 for (Instruction *I : MG.Main)
388 OS << " " << *I << '\n';
389 OS << "Deps\n";
390 for (Instruction *I : MG.Deps)
391 OS << " " << *I << '\n';
392 OS << "Clones\n";
393 for (auto [K, V] : MG.Clones) {
394 OS << " ";
395 K->printAsOperand(OS, false);
396 OS << "\t-> " << *V << '\n';
397 }
398 return OS;
399}
400
401[[maybe_unused]] raw_ostream &
402operator<<(raw_ostream &OS, const AlignVectors::ByteSpan::Block &B) {
403 OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] ";
404 if (B.Seg.Val == reinterpret_cast<const Value *>(&B)) {
405 OS << "(self:" << B.Seg.Val << ')';
406 } else if (B.Seg.Val != nullptr) {
407 OS << *B.Seg.Val;
408 } else {
409 OS << "(null)";
410 }
411 return OS;
412}
413
414[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
415 const AlignVectors::ByteSpan &BS) {
416 OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';
417 for (const AlignVectors::ByteSpan::Block &B : BS)
418 OS << B << '\n';
419 OS << ']';
420 return OS;
421}
422
423class HvxIdioms {
424public:
425 enum DstQualifier {
426 Undefined = 0,
427 Arithmetic,
428 LdSt,
429 LLVM_Gather,
430 LLVM_Scatter,
431 HEX_Gather_Scatter,
432 HEX_Gather,
433 HEX_Scatter,
434 Call
435 };
436
437 HvxIdioms(const HexagonVectorCombine &HVC_) : HVC(HVC_) {
438 auto *Int32Ty = HVC.getIntTy(32);
439 HvxI32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/false);
440 HvxP32Ty = HVC.getHvxTy(Int32Ty, /*Pair=*/true);
441 }
442
443 bool run();
444
445private:
446 enum Signedness { Positive, Signed, Unsigned };
447
448 // Value + sign
449 // This is to keep track of whether the value should be treated as signed
450 // or unsigned, or is known to be positive.
451 struct SValue {
452 Value *Val;
453 Signedness Sgn;
454 };
455
456 struct FxpOp {
457 unsigned Opcode;
458 unsigned Frac; // Number of fraction bits
459 SValue X, Y;
460 // If present, add 1 << RoundAt before shift:
461 std::optional<unsigned> RoundAt;
462 VectorType *ResTy;
463 };
464
465 auto getNumSignificantBits(Value *V, Instruction *In) const
466 -> std::pair<unsigned, Signedness>;
467 auto canonSgn(SValue X, SValue Y) const -> std::pair<SValue, SValue>;
468
469 auto matchFxpMul(Instruction &In) const -> std::optional<FxpOp>;
470 auto processFxpMul(Instruction &In, const FxpOp &Op) const -> Value *;
471
472 auto processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
473 const FxpOp &Op) const -> Value *;
474 auto createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
475 bool Rounding) const -> Value *;
476 auto createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
477 bool Rounding) const -> Value *;
478 // Return {Result, Carry}, where Carry is a vector predicate.
479 auto createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
480 Value *CarryIn = nullptr) const
481 -> std::pair<Value *, Value *>;
482 auto createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const -> Value *;
483 auto createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
484 -> Value *;
485 auto createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
486 -> std::pair<Value *, Value *>;
487 auto createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
488 ArrayRef<Value *> WordY) const -> SmallVector<Value *>;
489 auto createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
490 Signedness SgnX, ArrayRef<Value *> WordY,
491 Signedness SgnY) const -> SmallVector<Value *>;
492 // Vector manipulations for Ripple
493 bool matchScatter(Instruction &In) const;
494 bool matchGather(Instruction &In) const;
495 Value *processVScatter(Instruction &In) const;
496 Value *processVGather(Instruction &In) const;
497
498 VectorType *HvxI32Ty;
499 VectorType *HvxP32Ty;
500 const HexagonVectorCombine &HVC;
501
502 friend raw_ostream &operator<<(raw_ostream &, const FxpOp &);
503};
504
505[[maybe_unused]] raw_ostream &operator<<(raw_ostream &OS,
506 const HvxIdioms::FxpOp &Op) {
507 static const char *SgnNames[] = {"Positive", "Signed", "Unsigned"};
508 OS << Instruction::getOpcodeName(Op.Opcode) << '.' << Op.Frac;
509 if (Op.RoundAt.has_value()) {
510 if (Op.Frac != 0 && *Op.RoundAt == Op.Frac - 1) {
511 OS << ":rnd";
512 } else {
513 OS << " + 1<<" << *Op.RoundAt;
514 }
515 }
516 OS << "\n X:(" << SgnNames[Op.X.Sgn] << ") " << *Op.X.Val << "\n"
517 << " Y:(" << SgnNames[Op.Y.Sgn] << ") " << *Op.Y.Val;
518 return OS;
519}
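// Illustrative-only scalar sketch of the arithmetic that FxpOp describes, as
// read from its fields above: multiply, optionally add 1 << RoundAt, then
// drop Frac fraction bits. The helper name and the use of 64-bit scalars are
// assumptions made for the example; the pass itself works on HVX vectors.
[[maybe_unused]] static long long fxpMulSketch(long long X, long long Y,
                                               unsigned Frac,
                                               std::optional<unsigned> RoundAt) {
  long long Prod = X * Y; // Product carries 2*Frac fraction bits.
  if (RoundAt)
    Prod += 1LL << *RoundAt; // RoundAt == Frac-1 gives round-to-nearest.
  return Prod >> Frac;       // Back to the input Q format.
}
// For Q15 inputs, fxpMulSketch(16384, 16384, 15, 14) == 8192, i.e.
// 0.5 * 0.5 == 0.25.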
520
521} // namespace
522
523namespace {
524
525template <typename T> T *getIfUnordered(T *MaybeT) {
526 return MaybeT && MaybeT->isUnordered() ? MaybeT : nullptr;
527}
528template <typename T> T *isCandidate(Instruction *In) {
529 return dyn_cast<T>(In);
530}
531template <> LoadInst *isCandidate<LoadInst>(Instruction *In) {
532 return getIfUnordered(dyn_cast<LoadInst>(In));
533}
534template <> StoreInst *isCandidate<StoreInst>(Instruction *In) {
535 return getIfUnordered(dyn_cast<StoreInst>(In));
536}
537
538#if !defined(_MSC_VER) || _MSC_VER >= 1926
539// VS2017 and some versions of VS2019 have trouble compiling this:
540// error C2976: 'std::map': too few template arguments
541// VS 2019 16.x is known to work, except for 16.4/16.5 (MSC_VER 1924/1925)
542template <typename Pred, typename... Ts>
543void erase_if(std::map<Ts...> &map, Pred p)
544#else
545template <typename Pred, typename T, typename U>
546void erase_if(std::map<T, U> &map, Pred p)
547#endif
548{
549 for (auto i = map.begin(), e = map.end(); i != e;) {
550 if (p(*i))
551 i = map.erase(i);
552 else
553 i = std::next(i);
554 }
555}
556
557// Forward other erase_ifs to the LLVM implementations.
558template <typename Pred, typename T> void erase_if(T &&container, Pred p) {
559 llvm::erase_if(std::forward<T>(container), p);
560}
561
562} // namespace
563
564// --- Begin AlignVectors
565
566// For brevity, only consider loads. We identify a group of loads where we
567// know the relative differences between their addresses, so we know how they
568// are laid out in memory (relative to one another). These loads can overlap,
569// can be shorter or longer than the desired vector length.
570// Ultimately we want to generate a sequence of aligned loads that will load
571// every byte that the original loads loaded, and have the program use these
572// loaded values instead of the original loads.
573// We consider the contiguous memory area spanned by all these loads.
574//
575// Let's say that a single aligned vector load can load 16 bytes at a time.
576// If the program wanted to use a byte at offset 13 from the beginning of the
577// original span, it will be a byte at offset 13+x in the aligned data for
578// some x>=0. This may happen to be in the first aligned load, or in the load
579// following it. Since we generally don't know what that alignment value
580// is at compile time, we proactively do valigns on the aligned loads, so
581// that the byte that was at offset 13 is still at offset 13 after the valigns.
582//
583// This will be the starting point for making the rest of the program use the
584// data loaded by the new loads.
585// For each original load, and its users:
586// %v = load ...
587// ... = %v
588// ... = %v
589// we create
590// %new_v = extract/combine/shuffle data from loaded/valigned vectors so
591// it contains the same value as %v did before
592// then replace all users of %v with %new_v.
593// ... = %new_v
594// ... = %new_v
595
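// Illustrative-only scalar model of the scheme described above; the helper
// name and the 8-byte "vector" length are assumptions for the example.
// Loading from the down-aligned address and then taking VecLen bytes starting
// at the misalignment (the valign step) leaves every byte of the original
// span at the same offset it had before.
[[maybe_unused]] static void valignSketch() {
  unsigned char Mem[32];
  for (int i = 0; i != 32; ++i)
    Mem[i] = static_cast<unsigned char>(i);
  const int VecLen = 8;                   // One "aligned vector" of 8 bytes.
  const int Start = 3;                    // Unaligned offset of the span.
  const int AlignDown = Start & -VecLen;  // Nearest down-aligned address: 0.
  const int AlignVal = Start - AlignDown; // Misalignment: 3.

  // Conceptually: concatenate the aligned loads at AlignDown, AlignDown+8, ...
  // and take VecLen bytes starting AlignVal bytes in. Indexing memory directly
  // yields the same bytes.
  unsigned char Realigned[VecLen];
  for (int i = 0; i != VecLen; ++i)
    Realigned[i] = Mem[AlignDown + AlignVal + i];
  assert(Realigned[0] == Mem[Start]);     // Offset 0 of the span is preserved,
  assert(Realigned[5] == Mem[Start + 5]); // and so is every other offset.
}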
596auto AlignVectors::ByteSpan::extent() const -> int {
597 if (size() == 0)
598 return 0;
599 int Min = Blocks[0].Pos;
600 int Max = Blocks[0].Pos + Blocks[0].Seg.Size;
601 for (int i = 1, e = size(); i != e; ++i) {
602 Min = std::min(Min, Blocks[i].Pos);
603 Max = std::max(Max, Blocks[i].Pos + Blocks[i].Seg.Size);
604 }
605 return Max - Min;
606}
607
608auto AlignVectors::ByteSpan::section(int Start, int Length) const -> ByteSpan {
609 ByteSpan Section;
610 for (const ByteSpan::Block &B : Blocks) {
611 int L = std::max(B.Pos, Start); // Left end.
612 int R = std::min(B.Pos + B.Seg.Size, Start + Length); // Right end+1.
613 if (L < R) {
614 // How much to chop off the beginning of the segment:
615 int Off = L > B.Pos ? L - B.Pos : 0;
616 Section.Blocks.emplace_back(B.Seg.Val, B.Seg.Start + Off, R - L, L);
617 }
618 }
619 return Section;
620}
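// Worked instance of the overlap arithmetic in section() above, with plain
// integers standing in for a Block (names are illustrative only). A block of
// 6 bytes taken from byte 4 of its value and placed at span offset 10 occupies
// span bytes [10,16); section(12, 8) keeps only the part inside [12,20).
[[maybe_unused]] static void byteSpanSectionSketch() {
  int Pos = 10, SegStart = 4, SegSize = 6; // The block.
  int Start = 12, Length = 8;              // The requested section.
  int L = std::max(Pos, Start);                    // 12
  int R = std::min(Pos + SegSize, Start + Length); // 16
  assert(L < R && "Block overlaps the section");
  int Off = L > Pos ? L - Pos : 0; // Chop 2 bytes off the front of the segment.
  // Resulting block: value bytes [6,10) placed at span offset 12.
  assert(SegStart + Off == 6 && R - L == 4 && L == 12);
}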
621
622auto AlignVectors::ByteSpan::shift(int Offset) -> ByteSpan & {
623 for (Block &B : Blocks)
624 B.Pos += Offset;
625 return *this;
626}
627
628auto AlignVectors::ByteSpan::values() const -> SmallVector<Value *, 8> {
629 SmallVector<Value *, 8> Values(Blocks.size());
630 for (int i = 0, e = Blocks.size(); i != e; ++i)
631 Values[i] = Blocks[i].Seg.Val;
632 return Values;
633}
634
635auto AlignVectors::getAddrInfo(Instruction &In) const
636 -> std::optional<AddrInfo> {
637 if (auto *L = isCandidate<LoadInst>(&In))
638 return AddrInfo(HVC, L, L->getPointerOperand(), L->getType(),
639 L->getAlign());
640 if (auto *S = isCandidate<StoreInst>(&In))
641 return AddrInfo(HVC, S, S->getPointerOperand(),
642 S->getValueOperand()->getType(), S->getAlign());
643 if (auto *II = isCandidate<IntrinsicInst>(&In)) {
644 Intrinsic::ID ID = II->getIntrinsicID();
645 switch (ID) {
646 case Intrinsic::masked_load:
647 return AddrInfo(HVC, II, II->getArgOperand(0), II->getType(),
648 II->getParamAlign(0).valueOrOne());
649 case Intrinsic::masked_store:
650 return AddrInfo(HVC, II, II->getArgOperand(1),
651 II->getArgOperand(0)->getType(),
652 II->getParamAlign(1).valueOrOne());
653 }
654 }
655 return std::nullopt;
656}
657
658auto AlignVectors::isHvx(const AddrInfo &AI) const -> bool {
659 return HVC.HST.isTypeForHVX(AI.ValTy);
660}
661
662auto AlignVectors::getPayload(Value *Val) const -> Value * {
663 if (auto *In = dyn_cast<Instruction>(Val)) {
664 Intrinsic::ID ID = 0;
665 if (auto *II = dyn_cast<IntrinsicInst>(In))
666 ID = II->getIntrinsicID();
667 if (isa<StoreInst>(In) || ID == Intrinsic::masked_store)
668 return In->getOperand(0);
669 }
670 return Val;
671}
672
673auto AlignVectors::getMask(Value *Val) const -> Value * {
674 if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
675 switch (II->getIntrinsicID()) {
676 case Intrinsic::masked_load:
677 return II->getArgOperand(1);
678 case Intrinsic::masked_store:
679 return II->getArgOperand(2);
680 }
681 }
682
683 Type *ValTy = getPayload(Val)->getType();
684 if (auto *VecTy = dyn_cast<VectorType>(ValTy))
685 return HVC.getFullValue(HVC.getBoolTy(HVC.length(VecTy)));
686 return HVC.getFullValue(HVC.getBoolTy());
687}
688
689auto AlignVectors::getPassThrough(Value *Val) const -> Value * {
690 if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
691 if (II->getIntrinsicID() == Intrinsic::masked_load)
692 return II->getArgOperand(2);
693 }
694 return UndefValue::get(getPayload(Val)->getType());
695}
696
697auto AlignVectors::createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr,
698 Type *ValTy, int Adjust,
699 const InstMap &CloneMap) const
700 -> Value * {
701 if (auto *I = dyn_cast<Instruction>(Ptr))
702 if (Instruction *New = CloneMap.lookup(I))
703 Ptr = New;
704 return Builder.CreatePtrAdd(Ptr, HVC.getConstInt(Adjust), "gep");
705}
706
707auto AlignVectors::createAlignedPointer(IRBuilderBase &Builder, Value *Ptr,
708 Type *ValTy, int Alignment,
709 const InstMap &CloneMap) const
710 -> Value * {
711 auto remap = [&](Value *V) -> Value * {
712 if (auto *I = dyn_cast<Instruction>(V)) {
713 for (auto [Old, New] : CloneMap)
714 I->replaceUsesOfWith(Old, New);
715 return I;
716 }
717 return V;
718 };
719 Value *AsInt = Builder.CreatePtrToInt(Ptr, HVC.getIntTy(), "pti");
720 Value *Mask = HVC.getConstInt(-Alignment);
721 Value *And = Builder.CreateAnd(remap(AsInt), Mask, "and");
722 return Builder.CreateIntToPtr(
723 And, PointerType::getUnqual(ValTy->getContext()), "itp");
724}
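// Scalar analogue of the rounding done above (illustrative only; the helper
// name is an assumption). ANDing an address with -Alignment clears its low
// bits and thus rounds it down to an Alignment boundary; this requires
// Alignment to be a power of two, which HVX vector lengths are.
[[maybe_unused]] static unsigned alignDownSketch(unsigned Addr,
                                                 unsigned Alignment) {
  assert((Alignment & (Alignment - 1)) == 0 && "Power of two expected");
  // E.g. alignDownSketch(0x1003, 0x80) == 0x1000.
  return Addr & -Alignment;
}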
725
726auto AlignVectors::createLoad(IRBuilderBase &Builder, Type *ValTy, Value *Ptr,
727 Value *Predicate, int Alignment, Value *Mask,
728 Value *PassThru,
729 ArrayRef<Value *> MDSources) const -> Value * {
730 bool HvxHasPredLoad = HVC.HST.useHVXV62Ops();
731 // Predicate is nullptr if not creating predicated load
732 if (Predicate) {
733 assert(!Predicate->getType()->isVectorTy() &&
734 "Expecting scalar predicate");
735 if (HVC.isFalse(Predicate))
736 return UndefValue::get(ValTy);
737 if (!HVC.isTrue(Predicate) && HvxHasPredLoad) {
738 Value *Load = createPredicatedLoad(Builder, ValTy, Ptr, Predicate,
739 Alignment, MDSources);
740 return Builder.CreateSelect(Mask, Load, PassThru);
741 }
742 // Predicate == true here.
743 }
744 assert(!HVC.isUndef(Mask)); // Should this be allowed?
745 if (HVC.isZero(Mask))
746 return PassThru;
747 if (HVC.isTrue(Mask))
748 return createSimpleLoad(Builder, ValTy, Ptr, Alignment, MDSources);
749
750 Instruction *Load = Builder.CreateMaskedLoad(ValTy, Ptr, Align(Alignment),
751 Mask, PassThru, "mld");
752 propagateMetadata(Load, MDSources);
753 return Load;
754}
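// Per-lane model of the masked load handled above (illustrative only): an
// enabled lane reads memory, a disabled lane produces the pass-through value.
// This is what makes the simplifications valid: an all-zero mask reduces to
// PassThru, and an all-ones mask reduces to a plain (aligned) load.
[[maybe_unused]] static int maskedLoadLaneSketch(bool MaskBit, const int *Ptr,
                                                 int PassThru) {
  return MaskBit ? *Ptr : PassThru;
}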
755
756auto AlignVectors::createSimpleLoad(IRBuilderBase &Builder, Type *ValTy,
757 Value *Ptr, int Alignment,
758 ArrayRef<Value *> MDSources) const
759 -> Value * {
760 Instruction *Load =
761 Builder.CreateAlignedLoad(ValTy, Ptr, Align(Alignment), "ald");
762 propagateMetadata(Load, MDSources);
763 return Load;
764}
765
766auto AlignVectors::createPredicatedLoad(IRBuilderBase &Builder, Type *ValTy,
767 Value *Ptr, Value *Predicate,
768 int Alignment,
769 ArrayRef<Value *> MDSources) const
770 -> Value * {
771 assert(HVC.HST.isTypeForHVX(ValTy) &&
772 "Predicated 'scalar' vector loads not yet supported");
773 assert(Predicate);
774 assert(!Predicate->getType()->isVectorTy() && "Expecting scalar predicate");
775 assert(HVC.getSizeOf(ValTy, HVC.Alloc) % Alignment == 0);
776 if (HVC.isFalse(Predicate))
777 return UndefValue::get(ValTy);
778 if (HVC.isTrue(Predicate))
779 return createSimpleLoad(Builder, ValTy, Ptr, Alignment, MDSources);
780
781 auto V6_vL32b_pred_ai = HVC.HST.getIntrinsicId(Hexagon::V6_vL32b_pred_ai);
782 // FIXME: This may not put the offset from Ptr into the vmem offset.
783 return HVC.createHvxIntrinsic(Builder, V6_vL32b_pred_ai, ValTy,
784 {Predicate, Ptr, HVC.getConstInt(0)}, {},
785 MDSources);
786}
787
788auto AlignVectors::createStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
789 Value *Predicate, int Alignment, Value *Mask,
790 ArrayRef<Value *> MDSources) const -> Value * {
791 if (HVC.isZero(Mask) || HVC.isUndef(Val) || HVC.isUndef(Mask))
792 return UndefValue::get(Val->getType());
793 assert(!Predicate || (!Predicate->getType()->isVectorTy() &&
794 "Expecting scalar predicate"));
795 if (Predicate) {
796 if (HVC.isFalse(Predicate))
797 return UndefValue::get(Val->getType());
798 if (HVC.isTrue(Predicate))
799 Predicate = nullptr;
800 }
801 // Here both Predicate and Mask are true or unknown.
802
803 if (HVC.isTrue(Mask)) {
804 if (Predicate) { // Predicate unknown
805 return createPredicatedStore(Builder, Val, Ptr, Predicate, Alignment,
806 MDSources);
807 }
808 // Predicate is true:
809 return createSimpleStore(Builder, Val, Ptr, Alignment, MDSources);
810 }
811
812 // Mask is unknown
813 if (!Predicate) {
814 Instruction *Store =
815 Builder.CreateMaskedStore(Val, Ptr, Align(Alignment), Mask);
816 propagateMetadata(Store, MDSources);
817 return Store;
818 }
819
820 // Both Predicate and Mask are unknown.
821 // Emulate masked store with predicated-load + mux + predicated-store.
822 Value *PredLoad = createPredicatedLoad(Builder, Val->getType(), Ptr,
823 Predicate, Alignment, MDSources);
824 Value *Mux = Builder.CreateSelect(Mask, Val, PredLoad);
825 return createPredicatedStore(Builder, Mux, Ptr, Predicate, Alignment,
826 MDSources);
827}
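// Scalar sketch of the emulation used above when both the predicate and the
// mask are unknown (illustrative only): read the current contents under the
// predicate, blend in the new value under the mask, and write the blend back
// under the same predicate, so unmasked bytes are preserved.
[[maybe_unused]] static void maskedStoreEmulationSketch(bool Predicate,
                                                        bool MaskBit, int Val,
                                                        int *Ptr) {
  if (!Predicate)
    return;                            // Predicated out: no memory access.
  int Current = *Ptr;                  // Predicated load of existing contents.
  int Blend = MaskBit ? Val : Current; // Mux: keep old data where mask is 0.
  *Ptr = Blend;                        // Predicated store of the blend.
}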
828
829auto AlignVectors::createSimpleStore(IRBuilderBase &Builder, Value *Val,
830 Value *Ptr, int Alignment,
831 ArrayRef<Value *> MDSources) const
832 -> Value * {
833 Instruction *Store = Builder.CreateAlignedStore(Val, Ptr, Align(Alignment));
834 propagateMetadata(Store, MDSources);
835 return Store;
836}
837
838auto AlignVectors::createPredicatedStore(IRBuilderBase &Builder, Value *Val,
839 Value *Ptr, Value *Predicate,
840 int Alignment,
841 ArrayRef<Value *> MDSources) const
842 -> Value * {
843 assert(HVC.HST.isTypeForHVX(Val->getType()) &&
844 "Predicated 'scalar' vector stores not yet supported");
845 assert(Predicate);
846 if (HVC.isFalse(Predicate))
847 return UndefValue::get(Val->getType());
848 if (HVC.isTrue(Predicate))
849 return createSimpleStore(Builder, Val, Ptr, Alignment, MDSources);
850
851 assert(HVC.getSizeOf(Val, HVC.Alloc) % Alignment == 0);
852 auto V6_vS32b_pred_ai = HVC.HST.getIntrinsicId(Hexagon::V6_vS32b_pred_ai);
853 // FIXME: This may not put the offset from Ptr into the vmem offset.
854 return HVC.createHvxIntrinsic(Builder, V6_vS32b_pred_ai, nullptr,
855 {Predicate, Ptr, HVC.getConstInt(0), Val}, {},
856 MDSources);
857}
858
859auto AlignVectors::getUpwardDeps(Instruction *In, Instruction *Base) const
860 -> DepList {
861 BasicBlock *Parent = Base->getParent();
862 assert(In->getParent() == Parent &&
863 "Base and In should be in the same block");
864 assert(Base->comesBefore(In) && "Base should come before In");
865
866 DepList Deps;
867 std::deque<Instruction *> WorkQ = {In};
868 while (!WorkQ.empty()) {
869 Instruction *D = WorkQ.front();
870 WorkQ.pop_front();
871 if (D != In)
872 Deps.insert(D);
873 for (Value *Op : D->operands()) {
874 if (auto *I = dyn_cast<Instruction>(Op)) {
875 if (I->getParent() == Parent && Base->comesBefore(I))
876 WorkQ.push_back(I);
877 }
878 }
879 }
880 return Deps;
881}
882
883auto AlignVectors::createAddressGroups() -> bool {
884 // An address group created here may contain instructions spanning
885 // multiple basic blocks.
886 AddrList WorkStack;
887
888 auto findBaseAndOffset = [&](AddrInfo &AI) -> std::pair<Instruction *, int> {
889 for (AddrInfo &W : WorkStack) {
890 if (auto D = HVC.calculatePointerDifference(AI.Addr, W.Addr))
891 return std::make_pair(W.Inst, *D);
892 }
893 return std::make_pair(nullptr, 0);
894 };
895
896 auto traverseBlock = [&](DomTreeNode *DomN, auto Visit) -> void {
897 BasicBlock &Block = *DomN->getBlock();
898 for (Instruction &I : Block) {
899 auto AI = this->getAddrInfo(I); // Use this-> for gcc6.
900 if (!AI)
901 continue;
902 auto F = findBaseAndOffset(*AI);
903 Instruction *GroupInst;
904 if (Instruction *BI = F.first) {
905 AI->Offset = F.second;
906 GroupInst = BI;
907 } else {
908 WorkStack.push_back(*AI);
909 GroupInst = AI->Inst;
910 }
911 AddrGroups[GroupInst].push_back(*AI);
912 }
913
914 for (DomTreeNode *C : DomN->children())
915 Visit(C, Visit);
916
917 while (!WorkStack.empty() && WorkStack.back().Inst->getParent() == &Block)
918 WorkStack.pop_back();
919 };
920
921 traverseBlock(HVC.DT.getRootNode(), traverseBlock);
922 assert(WorkStack.empty());
923
924 // AddrGroups are formed.
925
926 // Remove groups of size 1.
927 erase_if(AddrGroups, [](auto &G) { return G.second.size() == 1; });
928 // Remove groups that don't use HVX types.
929 erase_if(AddrGroups, [&](auto &G) {
930 return llvm::none_of(
931 G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(I.ValTy); });
932 });
933
934 return !AddrGroups.empty();
935}
936
937auto AlignVectors::createLoadGroups(const AddrList &Group) const -> MoveList {
938 // Form load groups.
939 // To avoid complications with moving code across basic blocks, only form
940 // groups that are contained within a single basic block.
941 unsigned SizeLimit = VAGroupSizeLimit;
942 if (SizeLimit == 0)
943 return {};
944
945 auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
946 assert(!Move.Main.empty() && "Move group should have non-empty Main");
947 if (Move.Main.size() >= SizeLimit)
948 return false;
949 // Don't mix HVX and non-HVX instructions.
950 if (Move.IsHvx != isHvx(Info))
951 return false;
952 // Leading instruction in the load group.
953 Instruction *Base = Move.Main.front();
954 if (Base->getParent() != Info.Inst->getParent())
955 return false;
956 // Check if it's safe to move the load.
957 if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator()))
958 return false;
959 // And if it's safe to clone the dependencies.
960 auto isSafeToCopyAtBase = [&](const Instruction *I) {
961 return HVC.isSafeToMoveBeforeInBB(*I, Base->getIterator()) &&
962 HVC.isSafeToClone(*I);
963 };
964 DepList Deps = getUpwardDeps(Info.Inst, Base);
965 if (!llvm::all_of(Deps, isSafeToCopyAtBase))
966 return false;
967
968 Move.Main.push_back(Info.Inst);
969 llvm::append_range(Move.Deps, Deps);
970 return true;
971 };
972
973 MoveList LoadGroups;
974
975 for (const AddrInfo &Info : Group) {
976 if (!Info.Inst->mayReadFromMemory())
977 continue;
978 if (LoadGroups.empty() || !tryAddTo(Info, LoadGroups.back()))
979 LoadGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), true);
980 }
981
982 // Erase singleton groups.
983 erase_if(LoadGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
984
985 // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
986 if (!HVC.HST.useHVXV62Ops())
987 erase_if(LoadGroups, [](const MoveGroup &G) { return G.IsHvx; });
988
989 return LoadGroups;
990}
991
992auto AlignVectors::createStoreGroups(const AddrList &Group) const -> MoveList {
993 // Form store groups.
994 // To avoid complications with moving code across basic blocks, only form
995 // groups that are contained within a single basic block.
996 unsigned SizeLimit = VAGroupSizeLimit;
997 if (SizeLimit == 0)
998 return {};
999
1000 auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
1001 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1002 if (Move.Main.size() >= SizeLimit)
1003 return false;
1004 // For stores with return values we'd have to collect downward dependencies.
1005 // There are no such stores that we handle at the moment, so omit that.
1006 assert(Info.Inst->getType()->isVoidTy() &&
1007 "Not handling stores with return values");
1008 // Don't mix HVX and non-HVX instructions.
1009 if (Move.IsHvx != isHvx(Info))
1010 return false;
1011 // For stores we need to be careful whether it's safe to move them.
1012 // Stores that are otherwise safe to move together may not appear safe
1013 // to move over one another (i.e. isSafeToMoveBefore may return false).
1014 Instruction *Base = Move.Main.front();
1015 if (Base->getParent() != Info.Inst->getParent())
1016 return false;
1017 if (!HVC.isSafeToMoveBeforeInBB(*Info.Inst, Base->getIterator(), Move.Main))
1018 return false;
1019 Move.Main.push_back(Info.Inst);
1020 return true;
1021 };
1022
1023 MoveList StoreGroups;
1024
1025 for (auto I = Group.rbegin(), E = Group.rend(); I != E; ++I) {
1026 const AddrInfo &Info = *I;
1027 if (!Info.Inst->mayWriteToMemory())
1028 continue;
1029 if (StoreGroups.empty() || !tryAddTo(Info, StoreGroups.back()))
1030 StoreGroups.emplace_back(Info, Group.front().Inst, isHvx(Info), false);
1031 }
1032
1033 // Erase singleton groups.
1034 erase_if(StoreGroups, [](const MoveGroup &G) { return G.Main.size() <= 1; });
1035
1036 // Erase HVX groups on targets < HvxV62 (due to lack of predicated loads).
1037 if (!HVC.HST.useHVXV62Ops())
1038 erase_if(StoreGroups, [](const MoveGroup &G) { return G.IsHvx; });
1039
1040 // Erase groups where every store is a full HVX vector. The reason is that
1041 // aligning predicated stores generates complex code that may be less
1042 // efficient than a sequence of unaligned vector stores.
1043 if (!VADoFullStores) {
1044 erase_if(StoreGroups, [this](const MoveGroup &G) {
1045 return G.IsHvx && llvm::all_of(G.Main, [this](Instruction *S) {
1046 auto MaybeInfo = this->getAddrInfo(*S);
1047 assert(MaybeInfo.has_value());
1048 return HVC.HST.isHVXVectorType(
1049 EVT::getEVT(MaybeInfo->ValTy, false));
1050 });
1051 });
1052 }
1053
1054 return StoreGroups;
1055}
1056
1057auto AlignVectors::moveTogether(MoveGroup &Move) const -> bool {
1058 // Move all instructions to be adjacent.
1059 assert(!Move.Main.empty() && "Move group should have non-empty Main");
1060 Instruction *Where = Move.Main.front();
1061
1062 if (Move.IsLoad) {
1063 // Move all the loads (and dependencies) to where the first load is.
1064 // Clone all deps to before Where, keeping order.
1065 Move.Clones = cloneBefore(Where->getIterator(), Move.Deps);
1066 // Move all main instructions to after Where, keeping order.
1067 ArrayRef<Instruction *> Main(Move.Main);
1068 for (Instruction *M : Main) {
1069 if (M != Where)
1070 M->moveAfter(Where);
1071 for (auto [Old, New] : Move.Clones)
1072 M->replaceUsesOfWith(Old, New);
1073 Where = M;
1074 }
1075 // Replace Deps with the clones.
1076 for (int i = 0, e = Move.Deps.size(); i != e; ++i)
1077 Move.Deps[i] = Move.Clones[Move.Deps[i]];
1078 } else {
1079 // Move all the stores to where the last store is.
1080 // NOTE: Deps are empty for "store" groups. If they need to be
1081 // non-empty, decide on the order.
1082 assert(Move.Deps.empty());
1083 // Move all main instructions to before Where, inverting order.
1084 ArrayRef<Instruction *> Main(Move.Main);
1085 for (Instruction *M : Main.drop_front(1)) {
1086 M->moveBefore(Where->getIterator());
1087 Where = M;
1088 }
1089 }
1090
1091 return Move.Main.size() + Move.Deps.size() > 1;
1092}
1093
1094template <typename T>
1095auto AlignVectors::cloneBefore(BasicBlock::iterator To, T &&Insts) const
1096 -> InstMap {
1097 InstMap Map;
1098
1099 for (Instruction *I : Insts) {
1100 assert(HVC.isSafeToClone(*I));
1101 Instruction *C = I->clone();
1102 C->setName(Twine("c.") + I->getName() + ".");
1103 C->insertBefore(To);
1104
1105 for (auto [Old, New] : Map)
1106 C->replaceUsesOfWith(Old, New);
1107 Map.insert(std::make_pair(I, C));
1108 }
1109 return Map;
1110}
1111
1112auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
1113 const ByteSpan &VSpan, int ScLen,
1114 Value *AlignVal, Value *AlignAddr) const
1115 -> void {
1116 LLVM_DEBUG(dbgs() << __func__ << "\n");
1117
1118 Type *SecTy = HVC.getByteTy(ScLen);
1119 int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1120 bool DoAlign = !HVC.isZero(AlignVal);
1121 BasicBlock::iterator BasePos = Builder.GetInsertPoint();
1122 BasicBlock *BaseBlock = Builder.GetInsertBlock();
1123
1124 ByteSpan ASpan;
1125 auto *True = HVC.getFullValue(HVC.getBoolTy(ScLen));
1126 auto *Undef = UndefValue::get(SecTy);
1127
1128 // Created load does not have to be "Instruction" (e.g. "undef").
1129 SmallVector<Value *> Loads(NumSectors + DoAlign, nullptr);
1130
1131 // We could create all of the aligned loads, and generate the valigns
1132 // at the location of the first load, but for large load groups, this
1133 // could create highly suboptimal code (there have been groups of 140+
1134 // loads in real code).
1135 // Instead, place the loads/valigns as close to the users as possible.
1136 // In any case we need to have a mapping from the blocks of VSpan (the
1137 // span covered by the pre-existing loads) to ASpan (the span covered
1138 // by the aligned loads). There is a small problem, though: ASpan needs
1139 // to have pointers to the loads/valigns, but we don't have these loads
1140 // because we don't know where to put them yet. We find out by creating
1141 // a section of ASpan that corresponds to values (blocks) from VSpan,
1142 // and checking where the new load should be placed. We need to attach
1143 // this location information to each block in ASpan somehow, so we put
1144 // distinct values for Seg.Val in each ASpan.Blocks[i], and use a map
1145 // to store the location for each Seg.Val.
1146 // The distinct values happen to be Blocks[i].Seg.Val = &Blocks[i],
1147 // which helps with printing ByteSpans without crashing when printing
1148 // Segments with these temporary identifiers in place of Val.
1149
1150 // Populate the blocks first, to avoid reallocations of the vector
1151 // interfering with generating the placeholder addresses.
1152 for (int Index = 0; Index != NumSectors; ++Index)
1153 ASpan.Blocks.emplace_back(nullptr, ScLen, Index * ScLen);
1154 for (int Index = 0; Index != NumSectors; ++Index) {
1155 ASpan.Blocks[Index].Seg.Val =
1156 reinterpret_cast<Value *>(&ASpan.Blocks[Index]);
1157 }
1158
1159 // Multiple values from VSpan can map to the same value in ASpan. Since we
1160 // try to create loads lazily, we need to find the earliest use for each
1161 // value from ASpan.
1162 DenseMap<void *, Instruction *> EarliestUser;
1163 auto isEarlier = [](Instruction *A, Instruction *B) {
1164 if (B == nullptr)
1165 return true;
1166 if (A == nullptr)
1167 return false;
1168 assert(A->getParent() == B->getParent());
1169 return A->comesBefore(B);
1170 };
1171 auto earliestUser = [&](const auto &Uses) {
1172 Instruction *User = nullptr;
1173 for (const Use &U : Uses) {
1174 auto *I = dyn_cast<Instruction>(U.getUser());
1175 assert(I != nullptr && "Load used in a non-instruction?");
1176 // Make sure we only consider users in this block, but we need
1177 // to remember if there were users outside the block too. This is
1178 // because if no users are found, aligned loads will not be created.
1179 if (I->getParent() == BaseBlock) {
1180 if (!isa<PHINode>(I))
1181 User = std::min(User, I, isEarlier);
1182 } else {
1183 User = std::min(User, BaseBlock->getTerminator(), isEarlier);
1184 }
1185 }
1186 return User;
1187 };
1188
1189 for (const ByteSpan::Block &B : VSpan) {
1190 ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size);
1191 for (const ByteSpan::Block &S : ASection) {
1192 auto &EU = EarliestUser[S.Seg.Val];
1193 EU = std::min(EU, earliestUser(B.Seg.Val->uses()), isEarlier);
1194 }
1195 }
1196
1197 LLVM_DEBUG({
1198 dbgs() << "ASpan:\n" << ASpan << '\n';
1199 dbgs() << "Earliest users of ASpan:\n";
1200 for (auto &[Val, User] : EarliestUser) {
1201 dbgs() << Val << "\n ->" << *User << '\n';
1202 }
1203 });
1204
1205 auto createLoad = [&](IRBuilderBase &Builder, const ByteSpan &VSpan,
1206 int Index, bool MakePred) {
1207 Value *Ptr =
1208 createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
1209 Value *Predicate =
1210 MakePred ? makeTestIfUnaligned(Builder, AlignVal, ScLen) : nullptr;
1211
1212 // If vector shifting is potentially needed, accumulate metadata
1213 // from source sections of twice the load width.
1214 int Start = (Index - DoAlign) * ScLen;
1215 int Width = (1 + DoAlign) * ScLen;
1216 return this->createLoad(Builder, SecTy, Ptr, Predicate, ScLen, True, Undef,
1217 VSpan.section(Start, Width).values());
1218 };
1219
1220 auto moveBefore = [this](BasicBlock::iterator In, BasicBlock::iterator To) {
1221 // Move In and its upward dependencies to before To.
1222 assert(In->getParent() == To->getParent());
1223 DepList Deps = getUpwardDeps(&*In, &*To);
1224 In->moveBefore(To);
1225 // DepList is sorted with respect to positions in the basic block.
1226 InstMap Map = cloneBefore(In, Deps);
1227 for (auto [Old, New] : Map)
1228 In->replaceUsesOfWith(Old, New);
1229 };
1230
1231 // Generate necessary loads at appropriate locations.
1232 LLVM_DEBUG(dbgs() << "Creating loads for ASpan sectors\n");
1233 for (int Index = 0; Index != NumSectors + 1; ++Index) {
1234 // In ASpan, each block will be either a single aligned load, or a
1235 // valign of a pair of loads. In the latter case, an aligned load j
1236 // will belong to the current valign, and the one in the previous
1237 // block (for j > 0).
1238 // Place the load at a location which will dominate the valign, assuming
1239 // the valign will be placed right before the earliest user.
1240 Instruction *PrevAt =
1241 DoAlign && Index > 0 ? EarliestUser[&ASpan[Index - 1]] : nullptr;
1242 Instruction *ThisAt =
1243 Index < NumSectors ? EarliestUser[&ASpan[Index]] : nullptr;
1244 if (auto *Where = std::min(PrevAt, ThisAt, isEarlier)) {
1245 Builder.SetInsertPoint(Where);
1246 Loads[Index] =
1247 createLoad(Builder, VSpan, Index, DoAlign && Index == NumSectors);
1248 // We know it's safe to put the load at BasePos, but we'd prefer to put
1249 // it at "Where". To see if the load is safe to be placed at Where, put
1250 // it there first and then check if it's safe to move it to BasePos.
1251 // If not, then the load needs to be placed at BasePos.
1252 // We can't do this check proactively because we need the load to exist
1253 // in order to check legality.
1254 if (auto *Load = dyn_cast<Instruction>(Loads[Index])) {
1255 if (!HVC.isSafeToMoveBeforeInBB(*Load, BasePos))
1256 moveBefore(Load->getIterator(), BasePos);
1257 }
1258 LLVM_DEBUG(dbgs() << "Loads[" << Index << "]:" << *Loads[Index] << '\n');
1259 }
1260 }
1261
1262 // Generate valigns if needed, and fill in proper values in ASpan
1263 LLVM_DEBUG(dbgs() << "Creating values for ASpan sectors\n");
1264 for (int Index = 0; Index != NumSectors; ++Index) {
1265 ASpan[Index].Seg.Val = nullptr;
1266 if (auto *Where = EarliestUser[&ASpan[Index]]) {
1267 Builder.SetInsertPoint(Where);
1268 Value *Val = Loads[Index];
1269 assert(Val != nullptr);
1270 if (DoAlign) {
1271 Value *NextLoad = Loads[Index + 1];
1272 assert(NextLoad != nullptr);
1273 Val = HVC.vralignb(Builder, Val, NextLoad, AlignVal);
1274 }
1275 ASpan[Index].Seg.Val = Val;
1276 LLVM_DEBUG(dbgs() << "ASpan[" << Index << "]:" << *Val << '\n');
1277 }
1278 }
1279
1280 for (const ByteSpan::Block &B : VSpan) {
1281 ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
1282 Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
1283 Builder.SetInsertPoint(cast<Instruction>(B.Seg.Val));
1284
1285 // We're generating a reduction, where each instruction depends on
1286 // the previous one, so we need to order them according to the position
1287 // of their inputs in the code.
1288 std::vector<ByteSpan::Block *> ABlocks;
1289 for (ByteSpan::Block &S : ASection) {
1290 if (S.Seg.Val != nullptr)
1291 ABlocks.push_back(&S);
1292 }
1293 llvm::sort(ABlocks,
1294 [&](const ByteSpan::Block *A, const ByteSpan::Block *B) {
1295 return isEarlier(cast<Instruction>(A->Seg.Val),
1296 cast<Instruction>(B->Seg.Val));
1297 });
1298 for (ByteSpan::Block *S : ABlocks) {
1299 // The processing of the data loaded by the aligned loads
1300 // needs to be inserted after the data is available.
1301 Instruction *SegI = cast<Instruction>(S->Seg.Val);
1302 Builder.SetInsertPoint(&*std::next(SegI->getIterator()));
1303 Value *Pay = HVC.vbytes(Builder, getPayload(S->Seg.Val));
1304 Accum =
1305 HVC.insertb(Builder, Accum, Pay, S->Seg.Start, S->Seg.Size, S->Pos);
1306 }
1307 // Instead of casting everything to bytes for the vselect, cast to the
1308 // original value type. This will avoid complications with casting masks.
1309 // For example, in cases when the original mask applied to i32, it could
1310 // be converted to a mask applicable to i8 via pred_typecast intrinsic,
1311 // but if the mask is not exactly of HVX length, extra handling would be
1312 // needed to make it work.
1313 Type *ValTy = getPayload(B.Seg.Val)->getType();
1314 Value *Cast = Builder.CreateBitCast(Accum, ValTy, "cst");
1315 Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,
1316 getPassThrough(B.Seg.Val), "sel");
1317 B.Seg.Val->replaceAllUsesWith(Sel);
1318 }
1319}
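// Illustrative-only sketch of the "earliest user" selection used above, with
// instruction positions reduced to integers (the helper and its parameters
// are assumptions for the example). Users outside the block count as the
// block's terminator, and a sector with no users at all gets no aligned load.
[[maybe_unused]] static std::optional<int>
earliestUserSketch(const std::vector<int> &UserPosInBlock,
                   bool HasUserOutsideBlock, int TerminatorPos) {
  std::optional<int> Earliest;
  for (int Pos : UserPosInBlock)
    Earliest = std::min(Earliest.value_or(Pos), Pos);
  if (HasUserOutsideBlock)
    Earliest = std::min(Earliest.value_or(TerminatorPos), TerminatorPos);
  return Earliest; // std::nullopt: nothing to feed, emit no load.
}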
1320
1321auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
1322 const ByteSpan &VSpan, int ScLen,
1323 Value *AlignVal, Value *AlignAddr) const
1324 -> void {
1325 LLVM_DEBUG(dbgs() << __func__ << "\n");
1326
1327 Type *SecTy = HVC.getByteTy(ScLen);
1328 int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
1329 bool DoAlign = !HVC.isZero(AlignVal);
1330
1331 // Stores.
1332 ByteSpan ASpanV, ASpanM;
1333
1334 // Return a vector value corresponding to the input value Val:
1335 // either <1 x Val> for scalar Val, or Val itself for vector Val.
1336 auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {
1337 Type *Ty = Val->getType();
1338 if (Ty->isVectorTy())
1339 return Val;
1340 auto *VecTy = VectorType::get(Ty, 1, /*Scalable=*/false);
1341 return Builder.CreateBitCast(Val, VecTy, "cst");
1342 };
1343
1344 // Create an extra "undef" sector at the beginning and at the end.
1345 // They will be used as the left/right filler in the vlalign step.
1346 for (int Index = (DoAlign ? -1 : 0); Index != NumSectors + DoAlign; ++Index) {
1347 // For stores, the size of each section is an aligned vector length.
1348 // Adjust the store offsets relative to the section start offset.
1349 ByteSpan VSection =
1350 VSpan.section(Index * ScLen, ScLen).shift(-Index * ScLen);
1351 Value *Undef = UndefValue::get(SecTy);
1352 Value *Zero = HVC.getNullValue(SecTy);
1353 Value *AccumV = Undef;
1354 Value *AccumM = Zero;
1355 for (ByteSpan::Block &S : VSection) {
1356 Value *Pay = getPayload(S.Seg.Val);
1357 Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
1358 Pay->getType(), HVC.getByteTy());
1359 Value *PartM = HVC.insertb(Builder, Zero, HVC.vbytes(Builder, Mask),
1360 S.Seg.Start, S.Seg.Size, S.Pos);
1361 AccumM = Builder.CreateOr(AccumM, PartM);
1362
1363 Value *PartV = HVC.insertb(Builder, Undef, HVC.vbytes(Builder, Pay),
1364 S.Seg.Start, S.Seg.Size, S.Pos);
1365
1366 AccumV = Builder.CreateSelect(
1367 Builder.CreateICmp(CmpInst::ICMP_NE, PartM, Zero), PartV, AccumV);
1368 }
1369 ASpanV.Blocks.emplace_back(AccumV, ScLen, Index * ScLen);
1370 ASpanM.Blocks.emplace_back(AccumM, ScLen, Index * ScLen);
1371 }
1372
1373 LLVM_DEBUG({
1374 dbgs() << "ASpanV before vlalign:\n" << ASpanV << '\n';
1375 dbgs() << "ASpanM before vlalign:\n" << ASpanM << '\n';
1376 });
1377
1378 // vlalign
1379 if (DoAlign) {
1380 for (int Index = 1; Index != NumSectors + 2; ++Index) {
1381 Value *PrevV = ASpanV[Index - 1].Seg.Val, *ThisV = ASpanV[Index].Seg.Val;
1382 Value *PrevM = ASpanM[Index - 1].Seg.Val, *ThisM = ASpanM[Index].Seg.Val;
1383 assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));
1384 ASpanV[Index - 1].Seg.Val = HVC.vlalignb(Builder, PrevV, ThisV, AlignVal);
1385 ASpanM[Index - 1].Seg.Val = HVC.vlalignb(Builder, PrevM, ThisM, AlignVal);
1386 }
1387 }
1388
1389 LLVM_DEBUG({
1390 dbgs() << "ASpanV after vlalign:\n" << ASpanV << '\n';
1391 dbgs() << "ASpanM after vlalign:\n" << ASpanM << '\n';
1392 });
1393
1394 auto createStore = [&](IRBuilderBase &Builder, const ByteSpan &ASpanV,
1395 const ByteSpan &ASpanM, int Index, bool MakePred) {
1396 Value *Val = ASpanV[Index].Seg.Val;
1397 Value *Mask = ASpanM[Index].Seg.Val; // bytes
1398 if (HVC.isUndef(Val) || HVC.isZero(Mask))
1399 return;
1400 Value *Ptr =
1401 createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
1402 Value *Predicate =
1403 MakePred ? makeTestIfUnaligned(Builder, AlignVal, ScLen) : nullptr;
1404
1405 // If vector shifting is potentially needed, accumulate metadata
1406 // from source sections of twice the store width.
1407 int Start = (Index - DoAlign) * ScLen;
1408 int Width = (1 + DoAlign) * ScLen;
1409 this->createStore(Builder, Val, Ptr, Predicate, ScLen,
1410 HVC.vlsb(Builder, Mask),
1411 VSpan.section(Start, Width).values());
1412 };
1413
1414 for (int Index = 0; Index != NumSectors + DoAlign; ++Index) {
1415 createStore(Builder, ASpanV, ASpanM, Index, DoAlign && Index == NumSectors);
1416 }
1417}
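// Byte-level sketch of the per-sector accumulation above (illustrative only).
// Each store contributes its payload bytes and mask bytes at its position in
// the sector; the accumulated mask records which sector bytes are actually
// written, and the rest keep whatever memory already holds.
[[maybe_unused]] static void sectorAccumulationSketch() {
  const int ScLen = 8;
  unsigned char AccumV[ScLen] = {}; // Accumulated payload bytes.
  unsigned char AccumM[ScLen] = {}; // Accumulated mask, 0xFF = byte written.
  const unsigned char Pay[3] = {1, 2, 3}; // One store: 3 bytes at offset 2.
  const int Pos = 2;
  for (int i = 0; i != 3; ++i) {
    AccumV[Pos + i] = Pay[i];
    AccumM[Pos + i] = 0xFF;
  }
  assert(AccumV[2] == 1 && AccumM[2] == 0xFF); // Written byte.
  assert(AccumM[0] == 0);                      // Untouched byte.
}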
1418
1419auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
1420 LLVM_DEBUG(dbgs() << "Realigning group:\n" << Move << '\n');
1421
1422 // TODO: Needs support for masked loads/stores of "scalar" vectors.
1423 if (!Move.IsHvx)
1424 return false;
1425
1426 // Return the element with the maximum alignment from Range,
1427 // where GetValue obtains the value to compare from an element.
1428 auto getMaxOf = [](auto Range, auto GetValue) {
1429 return *llvm::max_element(Range, [&GetValue](auto &A, auto &B) {
1430 return GetValue(A) < GetValue(B);
1431 });
1432 };
1433
1434 const AddrList &BaseInfos = AddrGroups.at(Move.Base);
1435
1436 // Conceptually, there is a vector of N bytes covering the addresses
1437 // starting from the minimum offset (i.e. Base.Addr+Start). This vector
1438 // represents a contiguous memory region that spans all accessed memory
1439 // locations.
1440 // The correspondence between loaded or stored values will be expressed
1441 // in terms of this vector. For example, the 0th element of the vector
1442 // from the Base address info will start at byte Start from the beginning
1443 // of this conceptual vector.
1444 //
1445 // This vector will be loaded/stored starting at the nearest down-aligned
1446 // address and the amount of the down-alignment will be AlignVal:
1447 // valign(load_vector(align_down(Base+Start)), AlignVal)
1448
1449 std::set<Instruction *> TestSet(Move.Main.begin(), Move.Main.end());
1450 AddrList MoveInfos;
1451 llvm::copy_if(
1452 BaseInfos, std::back_inserter(MoveInfos),
1453 [&TestSet](const AddrInfo &AI) { return TestSet.count(AI.Inst); });
1454
1455 // Maximum alignment present in the whole address group.
1456 const AddrInfo &WithMaxAlign =
1457 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.HaveAlign; });
1458 Align MaxGiven = WithMaxAlign.HaveAlign;
1459
1460 // Element with the minimum offset in the move address group.
1461 const AddrInfo &WithMinOffset =
1462 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return -AI.Offset; });
1463
1464 const AddrInfo &WithMaxNeeded =
1465 getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.NeedAlign; });
1466 Align MinNeeded = WithMaxNeeded.NeedAlign;
1467
1468 // Set the builder's insertion point right before the load group, or
1469 // immediately after the store group. (Instructions in a store group are
1470 // listed in reverse order.)
1471 Instruction *InsertAt = Move.Main.front();
1472 if (!Move.IsLoad) {
1473 // There should be a terminator (which a store isn't, but check anyway).
1474 assert(InsertAt->getIterator() != InsertAt->getParent()->end());
1475 InsertAt = &*std::next(InsertAt->getIterator());
1476 }
1477
1478 IRBuilder Builder(InsertAt->getParent(), InsertAt->getIterator(),
1479 InstSimplifyFolder(HVC.DL));
1480 Value *AlignAddr = nullptr; // Actual aligned address.
1481 Value *AlignVal = nullptr; // Right-shift amount (for valign).
1482
1483 if (MinNeeded <= MaxGiven) {
1484 int Start = WithMinOffset.Offset;
1485 int OffAtMax = WithMaxAlign.Offset;
1486 // Shift the offset of the maximally aligned instruction (OffAtMax)
1487 // back by just enough multiples of the required alignment to cover the
1488 // distance from Start to OffAtMax.
1489 // Calculate the address adjustment amount based on the address with the
1490 // maximum alignment. This is to allow a simple gep instruction instead
1491 // of potential bitcasts to i8*.
1492 int Adjust = -alignTo(OffAtMax - Start, MinNeeded.value());
1493 AlignAddr = createAdjustedPointer(Builder, WithMaxAlign.Addr,
1494 WithMaxAlign.ValTy, Adjust, Move.Clones);
1495 int Diff = Start - (OffAtMax + Adjust);
1496 AlignVal = HVC.getConstInt(Diff);
1497 assert(Diff >= 0);
1498 assert(static_cast<decltype(MinNeeded.value())>(Diff) < MinNeeded.value());
1499 } else {
1500 // WithMinOffset is the lowest address in the group,
1501 // WithMinOffset.Addr = Base+Start.
1502 // Align instructions for both HVX (V6_valign) and scalar (S2_valignrb)
1503 // mask off unnecessary bits, so it's ok to just use the original pointer as
1504 // the alignment amount.
1505 // Do an explicit down-alignment of the address to avoid creating an
1506 // aligned instruction with an address that is not really aligned.
1507 AlignAddr =
1508 createAlignedPointer(Builder, WithMinOffset.Addr, WithMinOffset.ValTy,
1509 MinNeeded.value(), Move.Clones);
1510 AlignVal =
1511 Builder.CreatePtrToInt(WithMinOffset.Addr, HVC.getIntTy(), "pti");
1512 if (auto *I = dyn_cast<Instruction>(AlignVal)) {
1513 for (auto [Old, New] : Move.Clones)
1514 I->replaceUsesOfWith(Old, New);
1515 }
1516 }
1517
1518 ByteSpan VSpan;
1519 for (const AddrInfo &AI : MoveInfos) {
1520 VSpan.Blocks.emplace_back(AI.Inst, HVC.getSizeOf(AI.ValTy),
1521 AI.Offset - WithMinOffset.Offset);
1522 }
1523
1524 // The aligned loads/stores will use blocks that are either scalars,
1525 // or HVX vectors. Let "sector" be the unified term for such a block.
1526 // blend(scalar, vector) -> sector...
1527 int ScLen = Move.IsHvx ? HVC.HST.getVectorLength()
1528 : std::max<int>(MinNeeded.value(), 4);
1529 assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);
1530 assert(Move.IsHvx || ScLen == 4 || ScLen == 8);
1531
1532 LLVM_DEBUG({
1533 dbgs() << "ScLen: " << ScLen << "\n";
1534 dbgs() << "AlignVal:" << *AlignVal << "\n";
1535 dbgs() << "AlignAddr:" << *AlignAddr << "\n";
1536 dbgs() << "VSpan:\n" << VSpan << '\n';
1537 });
1538
1539 if (Move.IsLoad)
1540 realignLoadGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1541 else
1542 realignStoreGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
1543
1544 for (auto *Inst : Move.Main)
1545 Inst->eraseFromParent();
1546
1547 return true;
1548}
1549
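// Build an i1 value that is true when AlignVal is not a multiple of
// Alignment. The bitwise test below assumes Alignment is a power of 2.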
1550auto AlignVectors::makeTestIfUnaligned(IRBuilderBase &Builder, Value *AlignVal,
1551 int Alignment) const -> Value * {
1552 auto *AlignTy = AlignVal->getType();
1553 Value *And = Builder.CreateAnd(
1554 AlignVal, ConstantInt::get(AlignTy, Alignment - 1), "and");
1555 Value *Zero = ConstantInt::get(AlignTy, 0);
1556 return Builder.CreateICmpNE(And, Zero, "isz");
1557}
1558
1559auto AlignVectors::isSectorTy(Type *Ty) const -> bool {
1560 if (!HVC.isByteVecTy(Ty))
1561 return false;
1562 int Size = HVC.getSizeOf(Ty);
1563 if (HVC.HST.isTypeForHVX(Ty))
1564 return Size == static_cast<int>(HVC.HST.getVectorLength());
1565 return Size == 4 || Size == 8;
1566}
1567
1568auto AlignVectors::run() -> bool {
1569 LLVM_DEBUG(dbgs() << "\nRunning HVC::AlignVectors on " << HVC.F.getName()
1570 << '\n');
1571 if (!createAddressGroups())
1572 return false;
1573
1574 LLVM_DEBUG({
1575 dbgs() << "Address groups(" << AddrGroups.size() << "):\n";
1576 for (auto &[In, AL] : AddrGroups) {
1577 for (const AddrInfo &AI : AL)
1578 dbgs() << "---\n" << AI << '\n';
1579 }
1580 });
1581
1582 bool Changed = false;
1583 MoveList LoadGroups, StoreGroups;
1584
1585 for (auto &G : AddrGroups) {
1586 llvm::append_range(LoadGroups, createLoadGroups(G.second));
1587 llvm::append_range(StoreGroups, createStoreGroups(G.second));
1588 }
1589
1590 LLVM_DEBUG({
1591 dbgs() << "\nLoad groups(" << LoadGroups.size() << "):\n";
1592 for (const MoveGroup &G : LoadGroups)
1593 dbgs() << G << "\n";
1594 dbgs() << "Store groups(" << StoreGroups.size() << "):\n";
1595 for (const MoveGroup &G : StoreGroups)
1596 dbgs() << G << "\n";
1597 });
1598
1599 // Cumulative limit on the number of groups.
1600 unsigned CountLimit = VAGroupCountLimit;
1601 if (CountLimit == 0)
1602 return false;
1603
1604 if (LoadGroups.size() > CountLimit) {
1605 LoadGroups.resize(CountLimit);
1606 StoreGroups.clear();
1607 } else {
1608 unsigned StoreLimit = CountLimit - LoadGroups.size();
1609 if (StoreGroups.size() > StoreLimit)
1610 StoreGroups.resize(StoreLimit);
1611 }
1612
1613 for (auto &M : LoadGroups)
1614 Changed |= moveTogether(M);
1615 for (auto &M : StoreGroups)
1616 Changed |= moveTogether(M);
1617
1618 LLVM_DEBUG(dbgs() << "After moveTogether:\n" << HVC.F);
1619
1620 for (auto &M : LoadGroups)
1621 Changed |= realignGroup(M);
1622 for (auto &M : StoreGroups)
1623 Changed |= realignGroup(M);
1624
1625 return Changed;
1626}
1627
1628// --- End AlignVectors
1629
1630// --- Begin HvxIdioms
1631
1632auto HvxIdioms::getNumSignificantBits(Value *V, Instruction *In) const
1633 -> std::pair<unsigned, Signedness> {
1634 unsigned Bits = HVC.getNumSignificantBits(V, In);
1635 // The significant bits are calculated including the sign bit. This may
1636 // add an extra bit for zero-extended values, e.g. (zext i32 to i64) may
1637 // result in 33 significant bits. To avoid extra words, skip the extra
1638 // sign bit, but keep information that the value is to be treated as
1639 // unsigned.
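// For example, a value known to be (zext i32 to i64) has 33 significant
// bits; the code below reports 32 bits instead and marks the value as
// Unsigned.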
1640 KnownBits Known = HVC.getKnownBits(V, In);
1641 Signedness Sign = Signed;
1642 unsigned NumToTest = 0; // Number of bits used in test for unsignedness.
1643 if (isPowerOf2_32(Bits))
1644 NumToTest = Bits;
1645 else if (Bits > 1 && isPowerOf2_32(Bits - 1))
1646 NumToTest = Bits - 1;
1647
1648 if (NumToTest != 0 && Known.Zero.ashr(NumToTest).isAllOnes()) {
1649 Sign = Unsigned;
1650 Bits = NumToTest;
1651 }
1652
1653 // If the top bit of the nearest power-of-2 is zero, this value is
1654 // positive. It could be treated as either signed or unsigned.
1655 if (unsigned Pow2 = PowerOf2Ceil(Bits); Pow2 != Bits) {
1656 if (Known.Zero.ashr(Pow2 - 1).isAllOnes())
1657 Sign = Positive;
1658 }
1659 return {Bits, Sign};
1660}
1661
1662auto HvxIdioms::canonSgn(SValue X, SValue Y) const
1663 -> std::pair<SValue, SValue> {
1664 // Canonicalize the signedness of X and Y, so that the result is one of:
1665 // S, S
1666 // U/P, S
1667 // U/P, U/P
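// In other words, if exactly one operand is Signed it ends up in Y, so
// callers only need to inspect Y.Sgn when X.Sgn is not Signed.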
1668 if (X.Sgn == Signed && Y.Sgn != Signed)
1669 std::swap(X, Y);
1670 return {X, Y};
1671}
1672
1673// Match
1674// (X * Y) [>> N], or
1675// ((X * Y) + (1 << M)) >> N
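// For example, a rounded Q15 multiply (X * Y + (1 << 14)) >> 15 is matched
// with Frac = 15 and RoundAt = 14.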
1676auto HvxIdioms::matchFxpMul(Instruction &In) const -> std::optional<FxpOp> {
1677 using namespace PatternMatch;
1678 auto *Ty = In.getType();
1679
1680 if (!Ty->isVectorTy() || !Ty->getScalarType()->isIntegerTy())
1681 return std::nullopt;
1682
1683 unsigned Width = cast<IntegerType>(Ty->getScalarType())->getBitWidth();
1684
1685 FxpOp Op;
1686 Value *Exp = &In;
1687
1688 // Fixed-point multiplication is always shifted right (except when the
1689 // fraction is 0 bits).
1690 auto m_Shr = [](auto &&V, auto &&S) {
1691 return m_CombineOr(m_LShr(V, S), m_AShr(V, S));
1692 };
1693
1694 uint64_t Qn = 0;
1695 if (Value *T; match(Exp, m_Shr(m_Value(T), m_ConstantInt(Qn)))) {
1696 Op.Frac = Qn;
1697 Exp = T;
1698 } else {
1699 Op.Frac = 0;
1700 }
1701
1702 if (Op.Frac > Width)
1703 return std::nullopt;
1704
1705 // Check if there is rounding added.
1706 uint64_t CV;
1707 if (Value *T;
1708 Op.Frac > 0 && match(Exp, m_Add(m_Value(T), m_ConstantInt(CV)))) {
1709 if (CV != 0 && !isPowerOf2_64(CV))
1710 return std::nullopt;
1711 if (CV != 0)
1712 Op.RoundAt = Log2_64(CV);
1713 Exp = T;
1714 }
1715
1716 // Check if the rest is a multiplication.
1717 if (match(Exp, m_Mul(m_Value(Op.X.Val), m_Value(Op.Y.Val)))) {
1718 Op.Opcode = Instruction::Mul;
1719 // FIXME: The information below is recomputed.
1720 Op.X.Sgn = getNumSignificantBits(Op.X.Val, &In).second;
1721 Op.Y.Sgn = getNumSignificantBits(Op.Y.Val, &In).second;
1722 Op.ResTy = cast<VectorType>(Ty);
1723 return Op;
1724 }
1725
1726 return std::nullopt;
1727}
1728
1729auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const
1730 -> Value * {
1731 assert(Op.X.Val->getType() == Op.Y.Val->getType());
1732
1733 auto *VecTy = dyn_cast<VectorType>(Op.X.Val->getType());
1734 if (VecTy == nullptr)
1735 return nullptr;
1736 auto *ElemTy = cast<IntegerType>(VecTy->getElementType());
1737 unsigned ElemWidth = ElemTy->getBitWidth();
1738
1739 // TODO: This can be relaxed after legalization is done pre-isel.
1740 if ((HVC.length(VecTy) * ElemWidth) % (8 * HVC.HST.getVectorLength()) != 0)
1741 return nullptr;
1742
1743 // There are no special intrinsics that should be used for multiplying
1744 // signed 8-bit values, so just skip them. Normal codegen should handle
1745 // this just fine.
1746 if (ElemWidth <= 8)
1747 return nullptr;
1748 // Similarly, if this is just a multiplication that can be handled without
1749 // intervention, then leave it alone.
1750 if (ElemWidth <= 32 && Op.Frac == 0)
1751 return nullptr;
1752
1753 auto [BitsX, SignX] = getNumSignificantBits(Op.X.Val, &In);
1754 auto [BitsY, SignY] = getNumSignificantBits(Op.Y.Val, &In);
1755
1756 // TODO: Add multiplication of vectors by scalar registers (up to 4 bytes).
1757
1758 Value *X = Op.X.Val, *Y = Op.Y.Val;
1759 IRBuilder Builder(In.getParent(), In.getIterator(),
1760 InstSimplifyFolder(HVC.DL));
1761
1762 auto roundUpWidth = [](unsigned Width) -> unsigned {
1763 if (Width <= 32 && !isPowerOf2_32(Width)) {
1764 // If the element width is not a power of 2, round it up
1765 // to the next one. Do this for widths not exceeding 32.
1766 return PowerOf2Ceil(Width);
1767 }
1768 if (Width > 32 && Width % 32 != 0) {
1769 // For wider elements, round it up to the multiple of 32.
1770 return alignTo(Width, 32u);
1771 }
1772 return Width;
1773 };
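// E.g. roundUpWidth(12) == 16 and roundUpWidth(40) == 64, while widths that
// are already powers of 2 (up to 32) or multiples of 32 (above 32) are
// returned unchanged.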
1774
1775 BitsX = roundUpWidth(BitsX);
1776 BitsY = roundUpWidth(BitsY);
1777
1778 // For elementwise multiplication vectors must have the same lengths, so
1779 // resize the elements of both inputs to the same width, the max of the
1780 // calculated significant bits.
1781 unsigned Width = std::max(BitsX, BitsY);
1782
1783 auto *ResizeTy = VectorType::get(HVC.getIntTy(Width), VecTy);
1784 if (Width < ElemWidth) {
1785 X = Builder.CreateTrunc(X, ResizeTy, "trn");
1786 Y = Builder.CreateTrunc(Y, ResizeTy, "trn");
1787 } else if (Width > ElemWidth) {
1788 X = SignX == Signed ? Builder.CreateSExt(X, ResizeTy, "sxt")
1789 : Builder.CreateZExt(X, ResizeTy, "zxt");
1790 Y = SignY == Signed ? Builder.CreateSExt(Y, ResizeTy, "sxt")
1791 : Builder.CreateZExt(Y, ResizeTy, "zxt");
1792 };
1793
1794 assert(X->getType() == Y->getType() && X->getType() == ResizeTy);
1795
1796 unsigned VecLen = HVC.length(ResizeTy);
1797 unsigned ChopLen = (8 * HVC.HST.getVectorLength()) / std::min(Width, 32u);
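// ChopLen is the number of elements per chunk, chosen so that every chunk
// spans a whole number of HVX vectors; the loop below processes the inputs
// one chunk at a time.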
1798
1799 SmallVector<Value *> Results;
1800 FxpOp ChopOp = Op;
1801 ChopOp.ResTy = VectorType::get(Op.ResTy->getElementType(), ChopLen, false);
1802
1803 for (unsigned V = 0; V != VecLen / ChopLen; ++V) {
1804 ChopOp.X.Val = HVC.subvector(Builder, X, V * ChopLen, ChopLen);
1805 ChopOp.Y.Val = HVC.subvector(Builder, Y, V * ChopLen, ChopLen);
1806 Results.push_back(processFxpMulChopped(Builder, In, ChopOp));
1807 if (Results.back() == nullptr)
1808 break;
1809 }
1810
1811 if (Results.empty() || Results.back() == nullptr)
1812 return nullptr;
1813
1814 Value *Cat = HVC.concat(Builder, Results);
1815 Value *Ext = SignX == Signed || SignY == Signed
1816 ? Builder.CreateSExt(Cat, VecTy, "sxt")
1817 : Builder.CreateZExt(Cat, VecTy, "zxt");
1818 return Ext;
1819}
1820
1821inline bool HvxIdioms::matchScatter(Instruction &In) const {
1822 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
1823 if (!II)
1824 return false;
1825 return (II->getIntrinsicID() == Intrinsic::masked_scatter);
1826}
1827
1828inline bool HvxIdioms::matchGather(Instruction &In) const {
1829 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&In);
1830 if (!II)
1831 return false;
1832 return (II->getIntrinsicID() == Intrinsic::masked_gather);
1833}
1834
1835Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual);
1836
1837// Binary instructions we want to handle as users of gather/scatter.
1838inline bool isArithmetic(unsigned Opc) {
1839 switch (Opc) {
1840 case Instruction::Add:
1841 case Instruction::Sub:
1842 case Instruction::Mul:
1843 case Instruction::And:
1844 case Instruction::Or:
1845 case Instruction::Xor:
1846 case Instruction::AShr:
1847 case Instruction::LShr:
1848 case Instruction::Shl:
1849 case Instruction::UDiv:
1850 return true;
1851 }
1852 return false;
1853}
1854
1855// TODO: Maybe use MemoryLocation for this. See getLocOrNone above.
1856inline Value *getPointer(Value *Ptr) {
1857 assert(Ptr && "Unable to extract pointer");
1858 if (isa<AllocaInst>(Ptr) || isa<Argument>(Ptr) || isa<GlobalValue>(Ptr))
1859 return Ptr;
1860 if (isa<LoadInst>(Ptr) || isa<StoreInst>(Ptr))
1861 return getLoadStorePointerOperand(Ptr);
1862 if (auto *II = dyn_cast<IntrinsicInst>(Ptr)) {
1863 if (II->getIntrinsicID() == Intrinsic::masked_store)
1864 return II->getOperand(1);
1865 }
1866 return nullptr;
1867}
1868
1869 Instruction *selectDestination(Instruction *In,
1870 HvxIdioms::DstQualifier &Qual) {
1871 Instruction *Destination = nullptr;
1872 if (!In)
1873 return Destination;
1874 if (isa<StoreInst>(In)) {
1875 Destination = In;
1876 Qual = HvxIdioms::LdSt;
1877 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(In)) {
1878 if (II->getIntrinsicID() == Intrinsic::masked_gather) {
1879 Destination = In;
1880 Qual = HvxIdioms::LLVM_Gather;
1881 } else if (II->getIntrinsicID() == Intrinsic::masked_scatter) {
1882 Destination = In;
1883 Qual = HvxIdioms::LLVM_Scatter;
1884 } else if (II->getIntrinsicID() == Intrinsic::masked_store) {
1885 Destination = In;
1886 Qual = HvxIdioms::LdSt;
1887 } else if (II->getIntrinsicID() ==
1888 Intrinsic::hexagon_V6_vgather_vscattermh) {
1889 Destination = In;
1890 Qual = HvxIdioms::HEX_Gather_Scatter;
1891 } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vscattermh_128B) {
1892 Destination = In;
1893 Qual = HvxIdioms::HEX_Scatter;
1894 } else if (II->getIntrinsicID() == Intrinsic::hexagon_V6_vgathermh_128B) {
1895 Destination = In;
1896 Qual = HvxIdioms::HEX_Gather;
1897 }
1898 } else if (isa<ZExtInst>(In)) {
1899 return locateDestination(In, Qual);
1900 } else if (isa<CastInst>(In)) {
1901 return locateDestination(In, Qual);
1902 } else if (isa<CallInst>(In)) {
1903 Destination = In;
1904 Qual = HvxIdioms::Call;
1905 } else if (isa<GetElementPtrInst>(In)) {
1906 return locateDestination(In, Qual);
1907 } else if (isArithmetic(In->getOpcode())) {
1908 Destination = In;
1909 Qual = HvxIdioms::Arithmetic;
1910 } else {
1911 LLVM_DEBUG(dbgs() << "Unhandled destination : " << *In << "\n");
1912 }
1913 return Destination;
1914}
1915
1916 // This method attempts to find the destination (user) for a given intrinsic.
1917 // Given that these are produced only by Ripple, the number of options is
1918 // limited. The simplest case is an explicit store, which in fact is redundant
1919 // (since the HVX gather creates its own store during packetization).
1920 // Nevertheless we need to figure out the address we are storing to. The other
1921 // cases are more complicated, but still few.
1922Instruction *locateDestination(Instruction *In, HvxIdioms::DstQualifier &Qual) {
1923 Instruction *Destination = nullptr;
1924 if (!In)
1925 return Destination;
1926 // Get all possible destinations
1927 SmallVector<Instruction *> Users;
1928 // Iterate over the uses of the instruction
1929 for (auto &U : In->uses()) {
1930 if (auto *UI = dyn_cast<Instruction>(U.getUser())) {
1931 Destination = selectDestination(UI, Qual);
1932 if (Destination)
1933 Users.push_back(Destination);
1934 }
1935 }
1936 // Now see which of the users (if any) is a memory destination.
1937 for (auto *I : Users)
1938 if (getPointer(I))
1939 return I;
1940 return Destination;
1941}
1942
1943 // The two intrinsics we handle here have the GEP in different operand positions.
1945 assert(In && "Bad instruction");
1946 IntrinsicInst *IIn = dyn_cast<IntrinsicInst>(In);
1947 assert((IIn && (IIn->getIntrinsicID() == Intrinsic::masked_gather ||
1948 IIn->getIntrinsicID() == Intrinsic::masked_scatter)) &&
1949 "Not a gather Intrinsic");
1950 GetElementPtrInst *GEPIndex = nullptr;
1951 if (IIn->getIntrinsicID() == Intrinsic::masked_gather)
1952 GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(0));
1953 else
1954 GEPIndex = dyn_cast<GetElementPtrInst>(IIn->getOperand(1));
1955 return GEPIndex;
1956}
1957
1958 // Given the intrinsic, find its GEP argument and extract the base address it
1959 // uses. The method relies on the way Ripple typically forms the GEP for
1960 // scatter/gather.
1961 Value *locateAddressFromIntrinsic(Instruction *In) {
1963 if (!GEPIndex) {
1964 LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
1965 return nullptr;
1966 }
1967 Value *BaseAddress = GEPIndex->getPointerOperand();
1968 auto *IndexLoad = dyn_cast<LoadInst>(BaseAddress);
1969 if (IndexLoad)
1970 return IndexLoad;
1971
1972 auto *IndexZEx = dyn_cast<ZExtInst>(BaseAddress);
1973 if (IndexZEx) {
1974 IndexLoad = dyn_cast<LoadInst>(IndexZEx->getOperand(0));
1975 if (IndexLoad)
1976 return IndexLoad;
1977 IntrinsicInst *II = dyn_cast<IntrinsicInst>(IndexZEx->getOperand(0));
1978 if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
1980 }
1981 auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(BaseAddress);
1982 if (BaseShuffle) {
1983 IndexLoad = dyn_cast<LoadInst>(BaseShuffle->getOperand(0));
1984 if (IndexLoad)
1985 return IndexLoad;
1986 auto *IE = dyn_cast<InsertElementInst>(BaseShuffle->getOperand(0));
1987 if (IE) {
1988 auto *Src = IE->getOperand(1);
1989 IndexLoad = dyn_cast<LoadInst>(Src);
1990 if (IndexLoad)
1991 return IndexLoad;
1992 auto *Alloca = dyn_cast<AllocaInst>(Src);
1993 if (Alloca)
1994 return Alloca;
1995 if (isa<Argument>(Src)) {
1996 return Src;
1997 }
1998 if (isa<GlobalValue>(Src)) {
1999 return Src;
2000 }
2001 }
2002 }
2003 LLVM_DEBUG(dbgs() << " Unable to locate Address from intrinsic\n");
2004 return nullptr;
2005}
2006
2007 Type *getIndexType(Value *In) {
2008 if (!In)
2009 return nullptr;
2010
2011 if (isa<LoadInst>(In) || isa<StoreInst>(In))
2012 return getLoadStoreType(In);
2013
2014 if (auto *II = dyn_cast<IntrinsicInst>(In)) {
2015 if (II->getIntrinsicID() == Intrinsic::masked_load)
2016 return II->getType();
2017 if (II->getIntrinsicID() == Intrinsic::masked_store)
2018 return II->getOperand(0)->getType();
2019 }
2020 return In->getType();
2021}
2022
2023 Value *locateIndexesFromGEP(Value *In) {
2024 if (!In)
2025 return nullptr;
2026 if (isa<LoadInst>(In))
2027 return In;
2028 if (auto *II = dyn_cast<IntrinsicInst>(In)) {
2029 if (II->getIntrinsicID() == Intrinsic::masked_load)
2030 return In;
2031 if (II->getIntrinsicID() == Intrinsic::masked_gather)
2032 return In;
2033 }
2034 if (auto *IndexZEx = dyn_cast<ZExtInst>(In))
2035 return locateIndexesFromGEP(IndexZEx->getOperand(0));
2036 if (auto *IndexSEx = dyn_cast<SExtInst>(In))
2037 return locateIndexesFromGEP(IndexSEx->getOperand(0));
2038 if (auto *BaseShuffle = dyn_cast<ShuffleVectorInst>(In))
2039 return locateIndexesFromGEP(BaseShuffle->getOperand(0));
2040 if (auto *IE = dyn_cast<InsertElementInst>(In))
2041 return locateIndexesFromGEP(IE->getOperand(1));
2042 if (auto *cstDataVector = dyn_cast<ConstantDataVector>(In))
2043 return cstDataVector;
2044 if (auto *GEPIndex = dyn_cast<GetElementPtrInst>(In))
2045 return GEPIndex->getOperand(0);
2046 return nullptr;
2047}
2048
2049 // Given the intrinsic, find its GEP argument and extract the offsets from the
2050 // base address it uses.
2051 Value *locateIndexesFromIntrinsic(Instruction *In) {
2053 if (!GEPIndex) {
2054 LLVM_DEBUG(dbgs() << " No GEP in intrinsic\n");
2055 return nullptr;
2056 }
2057 Value *Indexes = GEPIndex->getOperand(1);
2058 if (auto *IndexLoad = locateIndexesFromGEP(Indexes))
2059 return IndexLoad;
2060
2061 LLVM_DEBUG(dbgs() << " Unable to locate Index from intrinsic\n");
2062 return nullptr;
2063}
2064
2065 // Because of the awkward definition of many Hexagon intrinsics, we often have
2066 // to reinterpret a native HVX <64 x i16> as <32 x i32>, which in practice is a
2067 // NOP for all use cases; this only exists to make the IR builder happy.
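// (Both <64 x i16> and <32 x i32> occupy 1024 bits, i.e. one 128-byte HVX
// vector, so the identity shuffle plus bitcast below does not change any
// bits.)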
2068inline Value *getReinterpretiveCast_i16_to_i32(const HexagonVectorCombine &HVC,
2069 IRBuilderBase &Builder,
2070 LLVMContext &Ctx, Value *I) {
2071 assert(I && "Unable to reinterpret cast");
2072 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2073 std::vector<unsigned> shuffleMask;
2074 for (unsigned i = 0; i < 64; ++i)
2075 shuffleMask.push_back(i);
2076 Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
2077 Value *CastShuffle =
2078 Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
2079 return Builder.CreateBitCast(CastShuffle, NT, "cst64_i16_to_32_i32");
2080}
2081
2082// Recast <128 x i8> as <32 x i32>
2083inline Value *getReinterpretiveCast_i8_to_i32(const HexagonVectorCombine &HVC,
2084 IRBuilderBase &Builder,
2085 LLVMContext &Ctx, Value *I) {
2086 assert(I && "Unable to reinterpret cast");
2087 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2088 std::vector<unsigned> shuffleMask;
2089 for (unsigned i = 0; i < 128; ++i)
2090 shuffleMask.push_back(i);
2091 Constant *Mask = llvm::ConstantDataVector::get(Ctx, shuffleMask);
2092 Value *CastShuffle =
2093 Builder.CreateShuffleVector(I, I, Mask, "identity_shuffle");
2094 return Builder.CreateBitCast(CastShuffle, NT, "cst128_i8_to_32_i32");
2095}
2096
2097// Create <32 x i32> mask reinterpreted as <128 x i1> with a given pattern
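// (For example, the 0x00ff00ff pattern used by the byte-sized gather/scatter
// paths below yields a predicate that enables every other byte, i.e. the low
// byte of each halfword lane.)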
2098inline Value *get_i32_Mask(const HexagonVectorCombine &HVC,
2099 IRBuilderBase &Builder, LLVMContext &Ctx,
2100 unsigned int pattern) {
2101 std::vector<unsigned int> byteMask;
2102 for (unsigned i = 0; i < 32; ++i)
2103 byteMask.push_back(pattern);
2104
2105 return Builder.CreateIntrinsic(
2106 HVC.getBoolTy(128), HVC.HST.getIntrinsicId(Hexagon::V6_vandvrt),
2107 {llvm::ConstantDataVector::get(Ctx, byteMask), HVC.getConstInt(~0)},
2108 nullptr);
2109}
2110
2111Value *HvxIdioms::processVScatter(Instruction &In) const {
2112 auto *InpTy = dyn_cast<VectorType>(In.getOperand(0)->getType());
2113 assert(InpTy && "Cannot handle non-vector type for llvm.scatter/gather");
2114 unsigned InpSize = HVC.getSizeOf(InpTy);
2115 auto *F = In.getFunction();
2116 LLVMContext &Ctx = F->getContext();
2117 auto *ElemTy = dyn_cast<IntegerType>(InpTy->getElementType());
2118 assert(ElemTy && "llvm.scatter needs integer type argument");
2119 unsigned ElemWidth = HVC.DL.getTypeAllocSize(ElemTy);
2120 LLVM_DEBUG({
2121 unsigned Elements = HVC.length(InpTy);
2122 dbgs() << "\n[Process scatter](" << In << ")\n" << *In.getParent() << "\n";
2123 dbgs() << " Input type(" << *InpTy << ") elements(" << Elements
2124 << ") VecLen(" << InpSize << ") type(" << *ElemTy << ") ElemWidth("
2125 << ElemWidth << ")\n";
2126 });
2127
2128 IRBuilder Builder(In.getParent(), In.getIterator(),
2129 InstSimplifyFolder(HVC.DL));
2130
2131 auto *ValueToScatter = In.getOperand(0);
2132 LLVM_DEBUG(dbgs() << " ValueToScatter : " << *ValueToScatter << "\n");
2133
2134 if (HVC.HST.getVectorLength() != InpSize) {
2135 LLVM_DEBUG(dbgs() << "Unhandled vector size(" << InpSize
2136 << ") for vscatter\n");
2137 return nullptr;
2138 }
2139
2140 // Base address of indexes.
2141 auto *IndexLoad = locateAddressFromIntrinsic(&In);
2142 if (!IndexLoad)
2143 return nullptr;
2144 LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
2145
2146 // Address of destination. Must be in VTCM.
2147 auto *Ptr = getPointer(IndexLoad);
2148 if (!Ptr)
2149 return nullptr;
2150 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2151 // Indexes/offsets
2152 auto *Indexes = locateIndexesFromIntrinsic(&In);
2153 if (!Indexes)
2154 return nullptr;
2155 LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
2156 Value *CastedDst = Builder.CreateBitOrPointerCast(Ptr, Type::getInt32Ty(Ctx),
2157 "cst_ptr_to_i32");
2158 LLVM_DEBUG(dbgs() << " CastedDst : " << *CastedDst << "\n");
2159 // Adjust Indexes
2160 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2161 Value *CastIndex = nullptr;
2162 if (cstDataVector) {
2163 // Our indexes are represented as a constant. We need it in a reg.
2164 AllocaInst *IndexesAlloca =
2165 Builder.CreateAlloca(HVC.getHvxTy(HVC.getIntTy(32), false));
2166 [[maybe_unused]] auto *StoreIndexes =
2167 Builder.CreateStore(cstDataVector, IndexesAlloca);
2168 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2169 CastIndex = Builder.CreateLoad(IndexesAlloca->getAllocatedType(),
2170 IndexesAlloca, "reload_index");
2171 } else {
2172 if (ElemWidth == 2)
2173 CastIndex = getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2174 else
2175 CastIndex = Indexes;
2176 }
2177 LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
2178
2179 if (ElemWidth == 1) {
2180 // v128i8 There is no native instruction for this.
2181 // Do this as two Hi/Lo scatters with masking.
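// The byte indexes and the byte values are widened to halfwords with
// vunpack and then written with two masked halfword scatters (the Hi and Lo
// halves of the unpacked pairs).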
2182 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2183 // Extend indexes. We assume that indexes are in 128i8 format - need to
2184 // expand them to Hi/Lo 64i16
2185 Value *CastIndexes = Builder.CreateBitCast(CastIndex, NT, "cast_to_32i32");
2186 auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
2187 auto *UnpackedIndexes = Builder.CreateIntrinsic(
2188 HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastIndexes, nullptr);
2189 LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes << ")\n");
2190
2191 auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
2192 auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
2193 [[maybe_unused]] Value *IndexHi =
2194 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
2195 [[maybe_unused]] Value *IndexLo =
2196 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
2197 LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
2198 LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
2199 // Now unpack values to scatter
2200 Value *CastSrc =
2201 getReinterpretiveCast_i8_to_i32(HVC, Builder, Ctx, ValueToScatter);
2202 LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
2203 auto *UnpackedValueToScatter = Builder.CreateIntrinsic(
2204 HVC.getHvxTy(HVC.getIntTy(32), true), V6_vunpack, CastSrc, nullptr);
2205 LLVM_DEBUG(dbgs() << " UnpackedValToScat: " << *UnpackedValueToScatter
2206 << ")\n");
2207
2208 [[maybe_unused]] Value *UVSHi =
2209 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedValueToScatter);
2210 [[maybe_unused]] Value *UVSLo =
2211 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedValueToScatter);
2212 LLVM_DEBUG(dbgs() << " UVSHi : " << *UVSHi << ")\n");
2213 LLVM_DEBUG(dbgs() << " UVSLo : " << *UVSLo << ")\n");
2214
2215 // Create the mask for individual bytes
2216 auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
2217 LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
2218 [[maybe_unused]] auto *ResHi = Builder.CreateIntrinsic(
2219 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
2220 {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2221 IndexHi, UVSHi},
2222 nullptr);
2223 LLVM_DEBUG(dbgs() << " ResHi : " << *ResHi << ")\n");
2224 return Builder.CreateIntrinsic(
2225 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermhq_128B,
2226 {QByteMask, CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2227 IndexLo, UVSLo},
2228 nullptr);
2229 } else if (ElemWidth == 2) {
2230 Value *CastSrc =
2231 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, ValueToScatter);
2232 LLVM_DEBUG(dbgs() << " CastSrc : " << *CastSrc << ")\n");
2233 return Builder.CreateIntrinsic(
2234 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermh_128B,
2235 {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
2236 CastSrc},
2237 nullptr);
2238 } else if (ElemWidth == 4) {
2239 return Builder.CreateIntrinsic(
2240 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vscattermw_128B,
2241 {CastedDst, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), CastIndex,
2242 ValueToScatter},
2243 nullptr);
2244 } else {
2245 LLVM_DEBUG(dbgs() << "Unhandled element type for vscatter\n");
2246 return nullptr;
2247 }
2248}
2249
2250Value *HvxIdioms::processVGather(Instruction &In) const {
2251 [[maybe_unused]] auto *InpTy =
2252 dyn_cast<VectorType>(In.getOperand(0)->getType());
2253 assert(InpTy && "Cannot handle non-vector type for llvm.gather");
2254 [[maybe_unused]] auto *ElemTy =
2255 dyn_cast<PointerType>(InpTy->getElementType());
2256 assert(ElemTy && "llvm.gather needs vector of ptr argument");
2257 auto *F = In.getFunction();
2258 LLVMContext &Ctx = F->getContext();
2259 LLVM_DEBUG(dbgs() << "\n[Process gather](" << In << ")\n"
2260 << *In.getParent() << "\n");
2261 LLVM_DEBUG(dbgs() << " Input type(" << *InpTy << ") elements("
2262 << HVC.length(InpTy) << ") VecLen(" << HVC.getSizeOf(InpTy)
2263 << ") type(" << *ElemTy << ") Access alignment("
2264 << *In.getOperand(1) << ") AddressSpace("
2265 << ElemTy->getAddressSpace() << ")\n");
2266
2267 // TODO: Handle masking of elements.
2268 assert(dyn_cast<VectorType>(In.getOperand(2)->getType()) &&
2269 "llvm.gather needs vector for mask");
2270 IRBuilder Builder(In.getParent(), In.getIterator(),
2271 InstSimplifyFolder(HVC.DL));
2272
2273 // See who is using the result. The difference between the LLVM and HVX vgather
2274 // intrinsics makes it impossible to handle all cases without temp storage, and
2275 // alloca in VTCM is not yet supported, so for now we just bail out for those cases.
2276 HvxIdioms::DstQualifier Qual = HvxIdioms::Undefined;
2277 Instruction *Dst = locateDestination(&In, Qual);
2278 if (!Dst) {
2279 LLVM_DEBUG(dbgs() << " Unable to locate vgather destination\n");
2280 return nullptr;
2281 }
2282 LLVM_DEBUG(dbgs() << " Destination : " << *Dst << " Qual(" << Qual
2283 << ")\n");
2284
2285 // Address of destination. Must be in VTCM.
2286 auto *Ptr = getPointer(Dst);
2287 if (!Ptr) {
2288 LLVM_DEBUG(dbgs() << "Could not locate vgather destination ptr\n");
2289 return nullptr;
2290 }
2291
2292 // Result type. Assume it is a vector type.
2293 auto *DstType = cast<VectorType>(getIndexType(Dst));
2294 assert(DstType && "Cannot handle non vector dst type for llvm.gather");
2295
2296 // Base address for sources to be loaded
2297 auto *IndexLoad = locateAddressFromIntrinsic(&In);
2298 if (!IndexLoad)
2299 return nullptr;
2300 LLVM_DEBUG(dbgs() << " IndexLoad : " << *IndexLoad << "\n");
2301
2302 // Gather indexes/offsets
2303 auto *Indexes = locateIndexesFromIntrinsic(&In);
2304 if (!Indexes)
2305 return nullptr;
2306 LLVM_DEBUG(dbgs() << " Indexes : " << *Indexes << "\n");
2307
2308 Instruction *Gather = nullptr;
2309 Type *NT = HVC.getHvxTy(HVC.getIntTy(32), false);
2310 if (Qual == HvxIdioms::LdSt || Qual == HvxIdioms::Arithmetic) {
2311 // We fully assume the address space is in VTCM. We also assume that all
2312 // pointers in Operand(0) have the same base(!).
2313 // This is the most basic case of all the above.
2314 unsigned OutputSize = HVC.getSizeOf(DstType);
2315 auto *DstElemTy = cast<IntegerType>(DstType->getElementType());
2316 unsigned ElemWidth = HVC.DL.getTypeAllocSize(DstElemTy);
2317 LLVM_DEBUG(dbgs() << " Buffer type : " << *Ptr->getType()
2318 << " Address space ("
2319 << Ptr->getType()->getPointerAddressSpace() << ")\n"
2320 << " Result type : " << *DstType
2321 << "\n Size in bytes : " << OutputSize
2322 << " element type(" << *DstElemTy
2323 << ")\n ElemWidth : " << ElemWidth << " bytes\n");
2324
2325 auto *IndexType = cast<VectorType>(getIndexType(Indexes));
2326 assert(IndexType && "Cannot handle non vector index type for llvm.gather");
2327 unsigned IndexWidth = HVC.DL.getTypeAllocSize(IndexType->getElementType());
2328 LLVM_DEBUG(dbgs() << " IndexWidth(" << IndexWidth << ")\n");
2329
2330 // Intrinsic takes i32 instead of pointer so cast.
2331 Value *CastedPtr = Builder.CreateBitOrPointerCast(
2332 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2333 // [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, ...]
2334 // int_hexagon_V6_vgathermh [... , llvm_v16i32_ty]
2335 // int_hexagon_V6_vgathermh_128B [... , llvm_v32i32_ty]
2336 // int_hexagon_V6_vgathermhw [... , llvm_v32i32_ty]
2337 // int_hexagon_V6_vgathermhw_128B [... , llvm_v64i32_ty]
2338 // int_hexagon_V6_vgathermw [... , llvm_v16i32_ty]
2339 // int_hexagon_V6_vgathermw_128B [... , llvm_v32i32_ty]
2340 if (HVC.HST.getVectorLength() == OutputSize) {
2341 if (ElemWidth == 1) {
2342 // v128i8 There is no native instruction for this.
2343 // Do this as two Hi/Lo gathers with masking.
2344 // Unpack indexes. We assume that indexes are in 128i8 format - need to
2345 // expand them to Hi/Lo 64i16
2346 Value *CastIndexes =
2347 Builder.CreateBitCast(Indexes, NT, "cast_to_32i32");
2348 auto V6_vunpack = HVC.HST.getIntrinsicId(Hexagon::V6_vunpackub);
2349 auto *UnpackedIndexes =
2350 Builder.CreateIntrinsic(HVC.getHvxTy(HVC.getIntTy(32), true),
2351 V6_vunpack, CastIndexes, nullptr);
2352 LLVM_DEBUG(dbgs() << " UnpackedIndexes : " << *UnpackedIndexes
2353 << ")\n");
2354
2355 auto V6_hi = HVC.HST.getIntrinsicId(Hexagon::V6_hi);
2356 auto V6_lo = HVC.HST.getIntrinsicId(Hexagon::V6_lo);
2357 [[maybe_unused]] Value *IndexHi =
2358 HVC.createHvxIntrinsic(Builder, V6_hi, NT, UnpackedIndexes);
2359 [[maybe_unused]] Value *IndexLo =
2360 HVC.createHvxIntrinsic(Builder, V6_lo, NT, UnpackedIndexes);
2361 LLVM_DEBUG(dbgs() << " UnpackedIndHi : " << *IndexHi << ")\n");
2362 LLVM_DEBUG(dbgs() << " UnpackedIndLo : " << *IndexLo << ")\n");
2363 // Create the mask for individual bytes
2364 auto *QByteMask = get_i32_Mask(HVC, Builder, Ctx, 0x00ff00ff);
2365 LLVM_DEBUG(dbgs() << " QByteMask : " << *QByteMask << "\n");
2366 // We use our destination allocation as a temp storage
2367 // This is unlikely to work properly for masked gather.
2368 auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermhq);
2369 [[maybe_unused]] auto GatherHi = Builder.CreateIntrinsic(
2370 Type::getVoidTy(Ctx), V6_vgather,
2371 {Ptr, QByteMask, CastedPtr,
2372 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexHi},
2373 nullptr);
2374 LLVM_DEBUG(dbgs() << " GatherHi : " << *GatherHi << ")\n");
2375 // Rematerialize the result
2376 [[maybe_unused]] Value *LoadedResultHi = Builder.CreateLoad(
2377 HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_hi");
2378 LLVM_DEBUG(dbgs() << " LoadedResultHi : " << *LoadedResultHi << "\n");
2379 // Same for the low part. Here we use Gather to return a non-null result
2380 // from this function and continue to iterate. We also delete the Dst
2381 // store below.
2382 Gather = Builder.CreateIntrinsic(
2383 Type::getVoidTy(Ctx), V6_vgather,
2384 {Ptr, QByteMask, CastedPtr,
2385 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), IndexLo},
2386 nullptr);
2387 LLVM_DEBUG(dbgs() << " GatherLo : " << *Gather << ")\n");
2388 Value *LoadedResultLo = Builder.CreateLoad(
2389 HVC.getHvxTy(HVC.getIntTy(32), false), Ptr, "temp_result_lo");
2390 LLVM_DEBUG(dbgs() << " LoadedResultLo : " << *LoadedResultLo << "\n");
2391 // Now we have properly sized bytes in every other position
2392 // B b A a c a A b B c f F g G h H is presented as
2393 // B . b . A . a . c . a . A . b . B . c . f . F . g . G . h . H
2394 // Use vpack to gather them
2395 auto V6_vpackeb = HVC.HST.getIntrinsicId(Hexagon::V6_vpackeb);
2396 [[maybe_unused]] auto Res = Builder.CreateIntrinsic(
2397 NT, V6_vpackeb, {LoadedResultHi, LoadedResultLo}, nullptr);
2398 LLVM_DEBUG(dbgs() << " ScaledRes : " << *Res << "\n");
2399 [[maybe_unused]] auto *StoreRes = Builder.CreateStore(Res, Ptr);
2400 LLVM_DEBUG(dbgs() << " StoreRes : " << *StoreRes << "\n");
2401 } else if (ElemWidth == 2) {
2402 // v32i16
2403 if (IndexWidth == 2) {
2404 // Reinterpret 64i16 as 32i32. Only needed for a syntactic IR match.
2405 Value *CastIndex =
2406 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2407 LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
2408 // shift all i16 left by 1 to match short addressing mode instead of
2409 // byte.
2410 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2411 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2412 Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
2413 LLVM_DEBUG(dbgs()
2414 << " Shifted half index: " << *AdjustedIndex << ")\n");
2415
2416 auto V6_vgather = HVC.HST.getIntrinsicId(Hexagon::V6_vgathermh);
2417 // The 3rd argument is the size of the region to gather from. Probably
2418 // want to set it to max VTCM size.
2419 Gather = Builder.CreateIntrinsic(
2420 Type::getVoidTy(Ctx), V6_vgather,
2421 {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2422 AdjustedIndex},
2423 nullptr);
2424 for (auto &U : Dst->uses()) {
2425 if (auto *UI = dyn_cast<Instruction>(U.getUser()))
2426 LLVM_DEBUG(dbgs() << " dst used by: " << *UI << "\n");
2427 }
2428 for (auto &U : In.uses()) {
2429 if (auto *UI = dyn_cast<Instruction>(U.getUser()))
2430 LLVM_DEBUG(dbgs() << " In used by : " << *UI << "\n");
2431 }
2432 // Create temp load from result in case the result is used by any
2433 // other instruction.
2434 Value *LoadedResult = Builder.CreateLoad(
2435 HVC.getHvxTy(HVC.getIntTy(16), false), Ptr, "temp_result");
2436 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2437 In.replaceAllUsesWith(LoadedResult);
2438 } else {
2439 LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n");
2440 return nullptr;
2441 }
2442 } else if (ElemWidth == 4) {
2443 if (IndexWidth == 4) {
2444 // v32i32
2445 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2446 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2447 Builder, V6_vaslh, NT, {Indexes, HVC.getConstInt(2)});
2448 LLVM_DEBUG(dbgs()
2449 << " Shifted word index: " << *AdjustedIndex << ")\n");
2450 Gather = Builder.CreateIntrinsic(
2451 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermw_128B,
2452 {Ptr, CastedPtr, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2453 AdjustedIndex},
2454 nullptr);
2455 } else {
2456 LLVM_DEBUG(dbgs() << " Unhandled index type for vgather\n");
2457 return nullptr;
2458 }
2459 } else {
2460 LLVM_DEBUG(dbgs() << " Unhandled element type for vgather\n");
2461 return nullptr;
2462 }
2463 } else if (HVC.HST.getVectorLength() == OutputSize * 2) {
2464 // This is half of the reg width, duplicate low in high
2465 LLVM_DEBUG(dbgs() << " Unhandled half of register size\n");
2466 return nullptr;
2467 } else if (HVC.HST.getVectorLength() * 2 == OutputSize) {
2468 LLVM_DEBUG(dbgs() << " Unhandled twice the register size\n");
2469 return nullptr;
2470 }
2471 // Erase the original intrinsic and store that consumes it.
2472 // HVX will create a pseudo for gather that is expanded to gather + store
2473 // during packetization.
2474 Dst->eraseFromParent();
2475 } else if (Qual == HvxIdioms::LLVM_Scatter) {
2476 // Gather feeds directly into scatter.
2477 auto *DstInpTy = cast<VectorType>(Dst->getOperand(1)->getType());
2478 assert(DstInpTy && "Cannot handle non-vector type for llvm.scatter");
2479 [[maybe_unused]] unsigned DstInpSize = HVC.getSizeOf(DstInpTy);
2480 [[maybe_unused]] unsigned DstElements = HVC.length(DstInpTy);
2481 [[maybe_unused]] auto *DstElemTy =
2482 cast<PointerType>(DstInpTy->getElementType());
2483 assert(DstElemTy && "llvm.scatter needs vector of ptr argument");
2484 LLVM_DEBUG(dbgs() << " Gather feeds into scatter\n Values to scatter : "
2485 << *Dst->getOperand(0) << "\n");
2486 LLVM_DEBUG(dbgs() << " Dst type(" << *DstInpTy << ") elements("
2487 << DstElements << ") VecLen(" << DstInpSize << ") type("
2488 << *DstElemTy << ") Access alignment("
2489 << *Dst->getOperand(2) << ")\n");
2490 // Address of source
2491 auto *Src = getPointer(IndexLoad);
2492 if (!Src)
2493 return nullptr;
2494 LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
2495
2496 if (!isa<PointerType>(Src->getType())) {
2497 LLVM_DEBUG(dbgs() << " Source is not a pointer type...\n");
2498 return nullptr;
2499 }
2500
2501 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2502 Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2503 LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
2504
2505 auto *DstLoad = locateAddressFromIntrinsic(Dst);
2506 if (!DstLoad) {
2507 LLVM_DEBUG(dbgs() << " Unable to locate DstLoad\n");
2508 return nullptr;
2509 }
2510 LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
2511
2512 Value *Ptr = getPointer(DstLoad);
2513 if (!Ptr)
2514 return nullptr;
2515 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2516 Value *CastIndex =
2517 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, IndexLoad);
2518 LLVM_DEBUG(dbgs() << " Cast index: " << *CastIndex << ")\n");
2519 // Shift all i16 left by 1 to match short addressing mode instead of
2520 // byte.
2521 auto V6_vaslh = HVC.HST.getIntrinsicId(Hexagon::V6_vaslh);
2522 Value *AdjustedIndex = HVC.createHvxIntrinsic(
2523 Builder, V6_vaslh, NT, {CastIndex, HVC.getConstInt(1)});
2524 LLVM_DEBUG(dbgs() << " Shifted half index: " << *AdjustedIndex << ")\n");
2525
2526 return Builder.CreateIntrinsic(
2527 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2528 {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2529 AdjustedIndex},
2530 nullptr);
2531 } else if (Qual == HvxIdioms::HEX_Gather_Scatter) {
2532 // Gather feeds into previously inserted pseudo intrinsic.
2533 // These could not be in the same packet, so we need to generate another
2534 // pseudo that is expanded to .tmp + store V6_vgathermh_pseudo
2535 // V6_vgathermh_pseudo (ins IntRegs:$_dst_, s4_0Imm:$Ii, IntRegs:$Rt,
2536 // ModRegs:$Mu, HvxVR:$Vv)
2537 if (isa<AllocaInst>(IndexLoad)) {
2538 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2539 if (cstDataVector) {
2540 // Our indexes are represented as a constant. We need them in a reg.
2541 // This most likely will not work properly since alloca gives us a DDR
2542 // stack location. This will be fixed once we teach the compiler about VTCM.
2543 AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
2544 [[maybe_unused]] auto *StoreIndexes =
2545 Builder.CreateStore(cstDataVector, IndexesAlloca);
2546 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2547 Value *LoadedIndex = Builder.CreateLoad(
2548 IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");
2549 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2550 LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca << "\n");
2551
2552 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2553 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2554 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2555
2556 Gather = Builder.CreateIntrinsic(
2557 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2558 {ResultAlloca, CastedSrc,
2559 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
2560 nullptr);
2561 Value *LoadedResult = Builder.CreateLoad(
2562 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2563 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2564 LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
2565 In.replaceAllUsesWith(LoadedResult);
2566 }
2567 } else {
2568 // Address of source
2569 auto *Src = getPointer(IndexLoad);
2570 if (!Src)
2571 return nullptr;
2572 LLVM_DEBUG(dbgs() << " Src : " << *Src << "\n");
2573
2574 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2575 Src, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2576 LLVM_DEBUG(dbgs() << " CastedSrc: " << *CastedSrc << "\n");
2577
2578 auto *DstLoad = locateAddressFromIntrinsic(Dst);
2579 if (!DstLoad)
2580 return nullptr;
2581 LLVM_DEBUG(dbgs() << " DstLoad : " << *DstLoad << "\n");
2582 auto *Ptr = getPointer(DstLoad);
2583 if (!Ptr)
2584 return nullptr;
2585 LLVM_DEBUG(dbgs() << " Ptr : " << *Ptr << "\n");
2586
2587 Gather = Builder.CreateIntrinsic(
2588 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgather_vscattermh,
2589 {Ptr, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2590 Indexes},
2591 nullptr);
2592 }
2593 return Gather;
2594 } else if (Qual == HvxIdioms::HEX_Scatter) {
2595 // This is the case when result of a gather is used as an argument to
2596 // Intrinsic::hexagon_V6_vscattermh_128B. Most likely we just inserted it
2597 // ourselves. We have to create alloca, store to it, and replace all uses
2598 // with that.
2599 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2600 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2601 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2602 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2603 Value *CastIndex =
2604 getReinterpretiveCast_i16_to_i32(HVC, Builder, Ctx, Indexes);
2605 LLVM_DEBUG(dbgs() << " Cast index : " << *CastIndex << ")\n");
2606
2607 Gather = Builder.CreateIntrinsic(
2608 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2609 {ResultAlloca, CastedSrc, HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE),
2610 CastIndex},
2611 nullptr);
2612 Value *LoadedResult = Builder.CreateLoad(
2613 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2614 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2615 In.replaceAllUsesWith(LoadedResult);
2616 } else if (Qual == HvxIdioms::HEX_Gather) {
2617 // Gather feeds to another gather but already replaced with
2618 // hexagon_V6_vgathermh_128B
2619 if (isa<AllocaInst>(IndexLoad)) {
2620 auto *cstDataVector = dyn_cast<ConstantDataVector>(Indexes);
2621 if (cstDataVector) {
2622 // Our indexes are represented as a constant. We need it in a reg.
2623 AllocaInst *IndexesAlloca = Builder.CreateAlloca(NT);
2624
2625 [[maybe_unused]] auto *StoreIndexes =
2626 Builder.CreateStore(cstDataVector, IndexesAlloca);
2627 LLVM_DEBUG(dbgs() << " StoreIndexes : " << *StoreIndexes << "\n");
2628 Value *LoadedIndex = Builder.CreateLoad(
2629 IndexesAlloca->getAllocatedType(), IndexesAlloca, "reload_index");
2630 AllocaInst *ResultAlloca = Builder.CreateAlloca(NT);
2631 LLVM_DEBUG(dbgs() << " ResultAlloca : " << *ResultAlloca
2632 << "\n AddressSpace: "
2633 << ResultAlloca->getAddressSpace() << "\n";);
2634
2635 Value *CastedSrc = Builder.CreateBitOrPointerCast(
2636 IndexLoad, Type::getInt32Ty(Ctx), "cst_ptr_to_i32");
2637 LLVM_DEBUG(dbgs() << " CastedSrc : " << *CastedSrc << "\n");
2638
2639 Gather = Builder.CreateIntrinsic(
2640 Type::getVoidTy(Ctx), Intrinsic::hexagon_V6_vgathermh_128B,
2641 {ResultAlloca, CastedSrc,
2642 HVC.getConstInt(DEFAULT_HVX_VTCM_PAGE_SIZE), LoadedIndex},
2643 nullptr);
2644 Value *LoadedResult = Builder.CreateLoad(
2645 HVC.getHvxTy(HVC.getIntTy(16), false), ResultAlloca, "temp_result");
2646 LLVM_DEBUG(dbgs() << " LoadedResult : " << *LoadedResult << "\n");
2647 LLVM_DEBUG(dbgs() << " Gather : " << *Gather << "\n");
2648 In.replaceAllUsesWith(LoadedResult);
2649 }
2650 }
2651 } else if (Qual == HvxIdioms::LLVM_Gather) {
2652 // Gather feeds into another gather
2653 errs() << " Unimplemented vgather to vgather sequence\n";
2654 return nullptr;
2655 } else
2656 llvm_unreachable("Unhandled Qual enum");
2657
2658 return Gather;
2659}
2660
2661auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
2662 const FxpOp &Op) const -> Value * {
2663 assert(Op.X.Val->getType() == Op.Y.Val->getType());
2664 auto *InpTy = cast<VectorType>(Op.X.Val->getType());
2665 unsigned Width = InpTy->getScalarSizeInBits();
2666 bool Rounding = Op.RoundAt.has_value();
2667
2668 if (!Op.RoundAt || *Op.RoundAt == Op.Frac - 1) {
2669 // The fixed-point intrinsics do signed multiplication.
2670 if (Width == Op.Frac + 1 && Op.X.Sgn != Unsigned && Op.Y.Sgn != Unsigned) {
2671 Value *QMul = nullptr;
2672 if (Width == 16) {
2673 QMul = createMulQ15(Builder, Op.X, Op.Y, Rounding);
2674 } else if (Width == 32) {
2675 QMul = createMulQ31(Builder, Op.X, Op.Y, Rounding);
2676 }
2677 if (QMul != nullptr)
2678 return QMul;
2679 }
2680 }
2681
2682 assert(Width >= 32 || isPowerOf2_32(Width)); // Width <= 32 => Width is 2^n
2683 assert(Width < 32 || Width % 32 == 0); // Width > 32 => Width is 32*k
2684
2685 // If Width < 32, then it should really be 16.
2686 if (Width < 32) {
2687 if (Width < 16)
2688 return nullptr;
2689 // Getting here with Op.Frac == 0 isn't wrong, but suboptimal: here we
2690 // generate a full-precision product, which is unnecessary if there is
2691 // no shift.
2692 assert(Width == 16);
2693 assert(Op.Frac != 0 && "Unshifted mul should have been skipped");
2694 if (Op.Frac == 16) {
2695 // Multiply high
2696 if (Value *MulH = createMulH16(Builder, Op.X, Op.Y))
2697 return MulH;
2698 }
2699 // Do full-precision multiply and shift.
2700 Value *Prod32 = createMul16(Builder, Op.X, Op.Y);
2701 if (Rounding) {
2702 Value *RoundVal = HVC.getConstSplat(Prod32->getType(), 1 << *Op.RoundAt);
2703 Prod32 = Builder.CreateAdd(Prod32, RoundVal, "add");
2704 }
2705
2706 Value *ShiftAmt = HVC.getConstSplat(Prod32->getType(), Op.Frac);
2707 Value *Shifted = Op.X.Sgn == Signed || Op.Y.Sgn == Signed
2708 ? Builder.CreateAShr(Prod32, ShiftAmt, "asr")
2709 : Builder.CreateLShr(Prod32, ShiftAmt, "lsr");
2710 return Builder.CreateTrunc(Shifted, InpTy, "trn");
2711 }
2712
2713 // Width >= 32
2714
2715 // Break up the arguments Op.X and Op.Y into vectors of smaller widths
2716 // in preparation of doing the multiplication by 32-bit parts.
2717 auto WordX = HVC.splitVectorElements(Builder, Op.X.Val, /*ToWidth=*/32);
2718 auto WordY = HVC.splitVectorElements(Builder, Op.Y.Val, /*ToWidth=*/32);
2719 auto WordP = createMulLong(Builder, WordX, Op.X.Sgn, WordY, Op.Y.Sgn);
2720
2721 auto *HvxWordTy = cast<VectorType>(WordP.front()->getType());
2722
2723 // Add the optional rounding to the proper word.
2724 if (Op.RoundAt.has_value()) {
2725 Value *Zero = HVC.getNullValue(WordX[0]->getType());
2726 SmallVector<Value *> RoundV(WordP.size(), Zero);
2727 RoundV[*Op.RoundAt / 32] =
2728 HVC.getConstSplat(HvxWordTy, 1 << (*Op.RoundAt % 32));
2729 WordP = createAddLong(Builder, WordP, RoundV);
2730 }
2731
2732 // createRightShiftLong?
2733
2734 // Shift all products right by Op.Frac.
2735 unsigned SkipWords = Op.Frac / 32;
2736 Constant *ShiftAmt = HVC.getConstSplat(HvxWordTy, Op.Frac % 32);
2737
2738 for (int Dst = 0, End = WordP.size() - SkipWords; Dst != End; ++Dst) {
2739 int Src = Dst + SkipWords;
2740 Value *Lo = WordP[Src];
2741 if (Src + 1 < End) {
2742 Value *Hi = WordP[Src + 1];
2743 WordP[Dst] = Builder.CreateIntrinsic(HvxWordTy, Intrinsic::fshr,
2744 {Hi, Lo, ShiftAmt},
2745 /*FMFSource*/ nullptr, "int");
2746 } else {
2747 // The shift of the most significant word.
2748 WordP[Dst] = Builder.CreateAShr(Lo, ShiftAmt, "asr");
2749 }
2750 }
2751 if (SkipWords != 0)
2752 WordP.resize(WordP.size() - SkipWords);
2753
2754 return HVC.joinVectorElements(Builder, WordP, Op.ResTy);
2755}
2756
2757auto HvxIdioms::createMulQ15(IRBuilderBase &Builder, SValue X, SValue Y,
2758 bool Rounding) const -> Value * {
2759 assert(X.Val->getType() == Y.Val->getType());
2760 assert(X.Val->getType()->getScalarType() == HVC.getIntTy(16));
2761 assert(HVC.HST.isHVXVectorType(EVT::getEVT(X.Val->getType(), false)));
2762
2763 // There is no non-rounding intrinsic for i16.
2764 if (!Rounding || X.Sgn == Unsigned || Y.Sgn == Unsigned)
2765 return nullptr;
2766
2767 auto V6_vmpyhvsrs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhvsrs);
2768 return HVC.createHvxIntrinsic(Builder, V6_vmpyhvsrs, X.Val->getType(),
2769 {X.Val, Y.Val});
2770}
2771
2772auto HvxIdioms::createMulQ31(IRBuilderBase &Builder, SValue X, SValue Y,
2773 bool Rounding) const -> Value * {
2774 Type *InpTy = X.Val->getType();
2775 assert(InpTy == Y.Val->getType());
2776 assert(InpTy->getScalarType() == HVC.getIntTy(32));
2777 assert(HVC.HST.isHVXVectorType(EVT::getEVT(InpTy, false)));
2778
2779 if (X.Sgn == Unsigned || Y.Sgn == Unsigned)
2780 return nullptr;
2781
2782 auto V6_vmpyewuh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyewuh);
2783 auto V6_vmpyo_acc = Rounding
2784 ? HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_rnd_sacc)
2785 : HVC.HST.getIntrinsicId(Hexagon::V6_vmpyowh_sacc);
2786 Value *V1 =
2787 HVC.createHvxIntrinsic(Builder, V6_vmpyewuh, InpTy, {X.Val, Y.Val});
2788 return HVC.createHvxIntrinsic(Builder, V6_vmpyo_acc, InpTy,
2789 {V1, X.Val, Y.Val});
2790}
2791
2792auto HvxIdioms::createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
2793 Value *CarryIn) const
2794 -> std::pair<Value *, Value *> {
2795 assert(X->getType() == Y->getType());
2796 auto VecTy = cast<VectorType>(X->getType());
2797 if (VecTy == HvxI32Ty && HVC.HST.useHVXV62Ops()) {
2798 SmallVector<Value *> Args = {X, Y};
2799 Intrinsic::ID AddCarry;
2800 if (CarryIn == nullptr && HVC.HST.useHVXV66Ops()) {
2801 AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarryo);
2802 } else {
2803 AddCarry = HVC.HST.getIntrinsicId(Hexagon::V6_vaddcarry);
2804 if (CarryIn == nullptr)
2805 CarryIn = HVC.getNullValue(HVC.getBoolTy(HVC.length(VecTy)));
2806 Args.push_back(CarryIn);
2807 }
2808 Value *Ret = HVC.createHvxIntrinsic(Builder, AddCarry,
2809 /*RetTy=*/nullptr, Args);
2810 Value *Result = Builder.CreateExtractValue(Ret, {0}, "ext");
2811 Value *CarryOut = Builder.CreateExtractValue(Ret, {1}, "ext");
2812 return {Result, CarryOut};
2813 }
2814
2815 // In other cases, do a regular add, and unsigned compare-less-than.
2816 // The carry-out can originate in two places: adding the carry-in or adding
2817 // the two input values.
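// (For unsigned values, A + B wraps around exactly when the sum compares
// less than one of the addends; the ICMP_ULT checks below use this to form
// the carry-outs.)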
2818 Value *Result1 = X; // Result1 = X + CarryIn
2819 if (CarryIn != nullptr) {
2820 unsigned Width = VecTy->getScalarSizeInBits();
2821 uint32_t Mask = 1;
2822 if (Width < 32) {
2823 for (unsigned i = 0, e = 32 / Width; i != e; ++i)
2824 Mask = (Mask << Width) | 1;
2825 }
2826 auto V6_vandqrt = HVC.HST.getIntrinsicId(Hexagon::V6_vandqrt);
2827 Value *ValueIn =
2828 HVC.createHvxIntrinsic(Builder, V6_vandqrt, /*RetTy=*/nullptr,
2829 {CarryIn, HVC.getConstInt(Mask)});
2830 Result1 = Builder.CreateAdd(X, ValueIn, "add");
2831 }
2832
2833 Value *CarryOut1 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result1, X, "cmp");
2834 Value *Result2 = Builder.CreateAdd(Result1, Y, "add");
2835 Value *CarryOut2 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result2, Y, "cmp");
2836 return {Result2, Builder.CreateOr(CarryOut1, CarryOut2, "orb")};
2837}
2838
2839auto HvxIdioms::createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const
2840 -> Value * {
2841 Intrinsic::ID V6_vmpyh = 0;
2842 std::tie(X, Y) = canonSgn(X, Y);
2843
2844 if (X.Sgn == Signed) {
2845 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhv);
2846 } else if (Y.Sgn == Signed) {
2847 // In vmpyhus the second operand is unsigned
2848 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyhus);
2849 } else {
2850 V6_vmpyh = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhv);
2851 }
2852
2853 // i16*i16 -> i32 / interleaved
2854 Value *P =
2855 HVC.createHvxIntrinsic(Builder, V6_vmpyh, HvxP32Ty, {Y.Val, X.Val});
2856 // Deinterleave
2857 return HVC.vshuff(Builder, HVC.sublo(Builder, P), HVC.subhi(Builder, P));
2858}
2859
2860auto HvxIdioms::createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
2861 -> Value * {
2862 Type *HvxI16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/false);
2863
2864 if (HVC.HST.useHVXV69Ops()) {
2865 if (X.Sgn != Signed && Y.Sgn != Signed) {
2866 auto V6_vmpyuhvs = HVC.HST.getIntrinsicId(Hexagon::V6_vmpyuhvs);
2867 return HVC.createHvxIntrinsic(Builder, V6_vmpyuhvs, HvxI16Ty,
2868 {X.Val, Y.Val});
2869 }
2870 }
2871
2872 Type *HvxP16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/true);
2873 Value *Pair16 =
2874 Builder.CreateBitCast(createMul16(Builder, X, Y), HvxP16Ty, "cst");
2875 unsigned Len = HVC.length(HvxP16Ty) / 2;
2876
2877 SmallVector<int, 128> PickOdd(Len);
2878 for (int i = 0; i != static_cast<int>(Len); ++i)
2879 PickOdd[i] = 2 * i + 1;
2880
2881 return Builder.CreateShuffleVector(
2882 HVC.sublo(Builder, Pair16), HVC.subhi(Builder, Pair16), PickOdd, "shf");
2883}
2884
2885auto HvxIdioms::createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
2886 -> std::pair<Value *, Value *> {
2887 assert(X.Val->getType() == Y.Val->getType());
2888 assert(X.Val->getType() == HvxI32Ty);
2889
2890 Intrinsic::ID V6_vmpy_parts;
2891 std::tie(X, Y) = canonSgn(X, Y);
2892
2893 if (X.Sgn == Signed) {
2894 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyss_parts;
2895 } else if (Y.Sgn == Signed) {
2896 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyus_parts;
2897 } else {
2898 V6_vmpy_parts = Intrinsic::hexagon_V6_vmpyuu_parts;
2899 }
2900
2901 Value *Parts = HVC.createHvxIntrinsic(Builder, V6_vmpy_parts, nullptr,
2902 {X.Val, Y.Val}, {HvxI32Ty});
2903 Value *Hi = Builder.CreateExtractValue(Parts, {0}, "ext");
2904 Value *Lo = Builder.CreateExtractValue(Parts, {1}, "ext");
2905 return {Lo, Hi};
2906}
2907
2908auto HvxIdioms::createAddLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
2909 ArrayRef<Value *> WordY) const
2910 -> SmallVector<Value *> {
2911 assert(WordX.size() == WordY.size());
2912 unsigned Idx = 0, Length = WordX.size();
2913 SmallVector<Value *> Sum(Length);
2914
2915 while (Idx != Length) {
2916 if (HVC.isZero(WordX[Idx]))
2917 Sum[Idx] = WordY[Idx];
2918 else if (HVC.isZero(WordY[Idx]))
2919 Sum[Idx] = WordX[Idx];
2920 else
2921 break;
2922 ++Idx;
2923 }
2924
2925 Value *Carry = nullptr;
2926 for (; Idx != Length; ++Idx) {
2927 std::tie(Sum[Idx], Carry) =
2928 createAddCarry(Builder, WordX[Idx], WordY[Idx], Carry);
2929 }
2930
2931 // This drops the final carry beyond the highest word.
2932 return Sum;
2933}
2934
2935auto HvxIdioms::createMulLong(IRBuilderBase &Builder, ArrayRef<Value *> WordX,
2936 Signedness SgnX, ArrayRef<Value *> WordY,
2937 Signedness SgnY) const -> SmallVector<Value *> {
2938 SmallVector<SmallVector<Value *>> Products(WordX.size() + WordY.size());
2939
2940 // WordX[i] * WordY[j] produces words i+j and i+j+1 of the results,
2941 // that is halves 2(i+j), 2(i+j)+1, 2(i+j)+2, 2(i+j)+3.
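// For example, with two 64-bit inputs split as WordX = {x0, x1} and
// WordY = {y0, y1}, x0*y0 contributes to words 0-1, x0*y1 and x1*y0 to
// words 1-2, and x1*y1 to words 2-3 of the 4-word product.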
2942 for (int i = 0, e = WordX.size(); i != e; ++i) {
2943 for (int j = 0, f = WordY.size(); j != f; ++j) {
2944 // Check the 4 halves that this multiplication can generate.
2945 Signedness SX = (i + 1 == e) ? SgnX : Unsigned;
2946 Signedness SY = (j + 1 == f) ? SgnY : Unsigned;
2947 auto [Lo, Hi] = createMul32(Builder, {WordX[i], SX}, {WordY[j], SY});
2948 Products[i + j + 0].push_back(Lo);
2949 Products[i + j + 1].push_back(Hi);
2950 }
2951 }
2952
2953 Value *Zero = HVC.getNullValue(WordX[0]->getType());
2954
2955 auto pop_back_or_zero = [Zero](auto &Vector) -> Value * {
2956 if (Vector.empty())
2957 return Zero;
2958 auto Last = Vector.back();
2959 Vector.pop_back();
2960 return Last;
2961 };
2962
2963 for (int i = 0, e = Products.size(); i != e; ++i) {
2964 while (Products[i].size() > 1) {
2965 Value *Carry = nullptr; // no carry-in
2966 for (int j = i; j != e; ++j) {
2967 auto &ProdJ = Products[j];
2968 auto [Sum, CarryOut] = createAddCarry(Builder, pop_back_or_zero(ProdJ),
2969 pop_back_or_zero(ProdJ), Carry);
2970 ProdJ.insert(ProdJ.begin(), Sum);
2971 Carry = CarryOut;
2972 }
2973 }
2974 }
2975
2976 SmallVector<Value *> WordP;
2977 for (auto &P : Products) {
2978 assert(P.size() == 1 && "Should have been added together");
2979 WordP.push_back(P.front());
2980 }
2981
2982 return WordP;
2983}
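// For illustration (a sketch, not code from this file): with two 2-word
// inputs X = (X1:X0) and Y = (Y1:Y0), the loop above files the partial
// products into result columns schoolbook-style:
//   X0*Y0: Lo -> column 0, Hi -> column 1
//   X0*Y1: Lo -> column 1, Hi -> column 2
//   X1*Y0: Lo -> column 1, Hi -> column 2
//   X1*Y1: Lo -> column 2, Hi -> column 3
// The reduction loop then adds each column down to a single word, chaining
// carries into the higher columns via createAddCarry.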
2984
2985auto HvxIdioms::run() -> bool {
2986 bool Changed = false;
2987
2988 for (BasicBlock &B : HVC.F) {
2989 for (auto It = B.rbegin(); It != B.rend(); ++It) {
2990 if (auto Fxm = matchFxpMul(*It)) {
2991 Value *New = processFxpMul(*It, *Fxm);
2992 // Always report "changed" for now.
2993 Changed = true;
2994 if (!New)
2995 continue;
2996 bool StartOver = !isa<Instruction>(New);
2997 It->replaceAllUsesWith(New);
2998 RecursivelyDeleteTriviallyDeadInstructions(&*It, &HVC.TLI);
2999 It = StartOver ? B.rbegin()
3000 : cast<Instruction>(New)->getReverseIterator();
3001 Changed = true;
3002 } else if (matchGather(*It)) {
3003 Value *New = processVGather(*It);
3004 if (!New)
3005 continue;
3006 LLVM_DEBUG(dbgs() << " Gather : " << *New << "\n");
3007 // We replace original intrinsic with a new pseudo call.
3008 It->eraseFromParent();
3009 It = cast<Instruction>(New)->getReverseIterator();
3011 Changed = true;
3012 } else if (matchScatter(*It)) {
3013 Value *New = processVScatter(*It);
3014 if (!New)
3015 continue;
3016 LLVM_DEBUG(dbgs() << " Scatter : " << *New << "\n");
3017 // We replace original intrinsic with a new pseudo call.
3018 It->eraseFromParent();
3019 It = cast<Instruction>(New)->getReverseIterator();
3021 Changed = true;
3022 }
3023 }
3024 }
3025
3026 return Changed;
3027}
3028
3029// --- End HvxIdioms
3030
3031auto HexagonVectorCombine::run() -> bool {
3032 if (DumpModule)
3033 dbgs() << "Module before HexagonVectorCombine\n" << *F.getParent();
3034
3035 bool Changed = false;
3036 if (HST.useHVXOps()) {
3037 if (VAEnabled)
3038 Changed |= AlignVectors(*this).run();
3039 if (VIEnabled)
3040 Changed |= HvxIdioms(*this).run();
3041 }
3042
3043 if (DumpModule) {
3044 dbgs() << "Module " << (Changed ? "(modified)" : "(unchanged)")
3045 << " after HexagonVectorCombine\n"
3046 << *F.getParent();
3047 }
3048 return Changed;
3049}
3050
3051auto HexagonVectorCombine::getIntTy(unsigned Width) const -> IntegerType * {
3052 return IntegerType::get(F.getContext(), Width);
3053}
3054
3055auto HexagonVectorCombine::getByteTy(int ElemCount) const -> Type * {
3056 assert(ElemCount >= 0);
3057 IntegerType *ByteTy = Type::getInt8Ty(F.getContext());
3058 if (ElemCount == 0)
3059 return ByteTy;
3060 return VectorType::get(ByteTy, ElemCount, /*Scalable=*/false);
3061}
3062
3063auto HexagonVectorCombine::getBoolTy(int ElemCount) const -> Type * {
3064 assert(ElemCount >= 0);
3065 IntegerType *BoolTy = Type::getInt1Ty(F.getContext());
3066 if (ElemCount == 0)
3067 return BoolTy;
3068 return VectorType::get(BoolTy, ElemCount, /*Scalable=*/false);
3069}
3070
3071auto HexagonVectorCombine::getConstInt(int Val, unsigned Width) const
3072 -> ConstantInt * {
3073 return ConstantInt::getSigned(getIntTy(Width), Val);
3074}
3075
3076auto HexagonVectorCombine::isZero(const Value *Val) const -> bool {
3077 if (auto *C = dyn_cast<Constant>(Val))
3078 return C->isZeroValue();
3079 return false;
3080}
3081
3082auto HexagonVectorCombine::getIntValue(const Value *Val) const
3083 -> std::optional<APInt> {
3084 if (auto *CI = dyn_cast<ConstantInt>(Val))
3085 return CI->getValue();
3086 return std::nullopt;
3087}
3088
3089auto HexagonVectorCombine::isUndef(const Value *Val) const -> bool {
3090 return isa<UndefValue>(Val);
3091}
3092
3093auto HexagonVectorCombine::isTrue(const Value *Val) const -> bool {
3094 return Val == ConstantInt::getTrue(Val->getType());
3095}
3096
3097auto HexagonVectorCombine::isFalse(const Value *Val) const -> bool {
3098 return isZero(Val);
3099}
3100
3101auto HexagonVectorCombine::getHvxTy(Type *ElemTy, bool Pair) const
3102 -> VectorType * {
3103 EVT ETy = EVT::getEVT(ElemTy, false);
3104 assert(ETy.isSimple() && "Invalid HVX element type");
3105 // Do not allow boolean types here: they don't have a fixed length.
3106 assert(HST.isHVXElementType(ETy.getSimpleVT(), /*IncludeBool=*/false) &&
3107 "Invalid HVX element type");
3108 unsigned HwLen = HST.getVectorLength();
3109 unsigned NumElems = (8 * HwLen) / ETy.getSizeInBits();
3110 return VectorType::get(ElemTy, Pair ? 2 * NumElems : NumElems,
3111 /*Scalable=*/false);
3112}
3113
3114auto HexagonVectorCombine::getSizeOf(const Value *Val, SizeKind Kind) const
3115 -> int {
3116 return getSizeOf(Val->getType(), Kind);
3117}
3118
3119auto HexagonVectorCombine::getSizeOf(const Type *Ty, SizeKind Kind) const
3120 -> int {
3121 auto *NcTy = const_cast<Type *>(Ty);
3122 switch (Kind) {
3123 case Store:
3124 return DL.getTypeStoreSize(NcTy).getFixedValue();
3125 case Alloc:
3126 return DL.getTypeAllocSize(NcTy).getFixedValue();
3127 }
3128 llvm_unreachable("Unhandled SizeKind enum");
3129}
3130
3131auto HexagonVectorCombine::getTypeAlignment(Type *Ty) const -> int {
3132 // The actual type may be shorter than the HVX vector, so determine
3133 // the alignment based on subtarget info.
3134 if (HST.isTypeForHVX(Ty))
3135 return HST.getVectorLength();
3136 return DL.getABITypeAlign(Ty).value();
3137}
3138
3139auto HexagonVectorCombine::length(Value *Val) const -> size_t {
3140 return length(Val->getType());
3141}
3142
3143auto HexagonVectorCombine::length(Type *Ty) const -> size_t {
3144 auto *VecTy = dyn_cast<VectorType>(Ty);
3145 assert(VecTy && "Must be a vector type");
3146 return VecTy->getElementCount().getFixedValue();
3147}
3148
3149auto HexagonVectorCombine::getNullValue(Type *Ty) const -> Constant * {
3150 assert(Ty->isIntOrIntVectorTy());
3151 auto Zero = ConstantInt::get(Ty->getScalarType(), 0);
3152 if (auto *VecTy = dyn_cast<VectorType>(Ty))
3153 return ConstantVector::getSplat(VecTy->getElementCount(), Zero);
3154 return Zero;
3155}
3156
3157auto HexagonVectorCombine::getFullValue(Type *Ty) const -> Constant * {
3158 assert(Ty->isIntOrIntVectorTy());
3159 auto Minus1 = ConstantInt::get(Ty->getScalarType(), -1);
3160 if (auto *VecTy = dyn_cast<VectorType>(Ty))
3161 return ConstantVector::getSplat(VecTy->getElementCount(), Minus1);
3162 return Minus1;
3163}
3164
3165auto HexagonVectorCombine::getConstSplat(Type *Ty, int Val) const
3166 -> Constant * {
3167 assert(Ty->isVectorTy());
3168 auto VecTy = cast<VectorType>(Ty);
3169 Type *ElemTy = VecTy->getElementType();
3170 // Add support for floats if needed.
3171 auto *Splat = ConstantVector::getSplat(VecTy->getElementCount(),
3172 ConstantInt::get(ElemTy, Val));
3173 return Splat;
3174}
3175
3176auto HexagonVectorCombine::simplify(Value *V) const -> Value * {
3177 if (auto *In = dyn_cast<Instruction>(V)) {
3178 SimplifyQuery Q(DL, &TLI, &DT, &AC, In);
3179 return simplifyInstruction(In, Q);
3180 }
3181 return nullptr;
3182}
3183
3184// Insert bytes [Start..Start+Length) of Src into Dst at byte Where.
3185auto HexagonVectorCombine::insertb(IRBuilderBase &Builder, Value *Dst,
3186 Value *Src, int Start, int Length,
3187 int Where) const -> Value * {
3188 assert(isByteVecTy(Dst->getType()) && isByteVecTy(Src->getType()));
3189 int SrcLen = getSizeOf(Src);
3190 int DstLen = getSizeOf(Dst);
3191 assert(0 <= Start && Start + Length <= SrcLen);
3192 assert(0 <= Where && Where + Length <= DstLen);
3193
3194 int P2Len = PowerOf2Ceil(SrcLen | DstLen);
3195 auto *Poison = PoisonValue::get(getByteTy());
3196 Value *P2Src = vresize(Builder, Src, P2Len, Poison);
3197 Value *P2Dst = vresize(Builder, Dst, P2Len, Poison);
3198
3199 SmallVector<int, 256> SMask(P2Len);
3200 for (int i = 0; i != P2Len; ++i) {
3201 // If i is in [Where, Where+Length), pick Src[Start+(i-Where)].
3202 // Otherwise, pick Dst[i];
3203 SMask[i] =
3204 (Where <= i && i < Where + Length) ? P2Len + Start + (i - Where) : i;
3205 }
3206
3207 Value *P2Insert = Builder.CreateShuffleVector(P2Dst, P2Src, SMask, "shf");
3208 return vresize(Builder, P2Insert, DstLen, Poison);
3209}
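// A small worked example (hypothetical sizes, for illustration only): with
// SrcLen = DstLen = 8, Start = 1, Length = 2, Where = 3, P2Len is 8 and the
// mask becomes
//   SMask = {0, 1, 2, 8+1, 8+2, 5, 6, 7} = {0, 1, 2, 9, 10, 5, 6, 7}
// so result bytes 3..4 are taken from Src[1..2] (indices offset by P2Len
// because Src is the second shuffle operand) and all other bytes come from
// Dst.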
3210
3211auto HexagonVectorCombine::vlalignb(IRBuilderBase &Builder, Value *Lo,
3212 Value *Hi, Value *Amt) const -> Value * {
3213 assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
3214 if (isZero(Amt))
3215 return Hi;
3216 int VecLen = getSizeOf(Hi);
3217 if (auto IntAmt = getIntValue(Amt))
3218 return getElementRange(Builder, Lo, Hi, VecLen - IntAmt->getSExtValue(),
3219 VecLen);
3220
3221 if (HST.isTypeForHVX(Hi->getType())) {
3222 assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
3223 "Expecting an exact HVX type");
3224 return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_vlalignb),
3225 Hi->getType(), {Hi, Lo, Amt});
3226 }
3227
3228 if (VecLen == 4) {
3229 Value *Pair = concat(Builder, {Lo, Hi});
3230 Value *Shift =
3231 Builder.CreateLShr(Builder.CreateShl(Pair, Amt, "shl"), 32, "lsr");
3232 Value *Trunc =
3233 Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");
3234 return Builder.CreateBitCast(Trunc, Hi->getType(), "cst");
3235 }
3236 if (VecLen == 8) {
3237 Value *Sub = Builder.CreateSub(getConstInt(VecLen), Amt, "sub");
3238 return vralignb(Builder, Lo, Hi, Sub);
3239 }
3240 llvm_unreachable("Unexpected vector length");
3241}
3242
3243auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo,
3244 Value *Hi, Value *Amt) const -> Value * {
3245 assert(Lo->getType() == Hi->getType() && "Argument type mismatch");
3246 if (isZero(Amt))
3247 return Lo;
3248 int VecLen = getSizeOf(Lo);
3249 if (auto IntAmt = getIntValue(Amt))
3250 return getElementRange(Builder, Lo, Hi, IntAmt->getSExtValue(), VecLen);
3251
3252 if (HST.isTypeForHVX(Lo->getType())) {
3253 assert(static_cast<unsigned>(VecLen) == HST.getVectorLength() &&
3254 "Expecting an exact HVX type");
3255 return createHvxIntrinsic(Builder, HST.getIntrinsicId(Hexagon::V6_valignb),
3256 Lo->getType(), {Hi, Lo, Amt});
3257 }
3258
3259 if (VecLen == 4) {
3260 Value *Pair = concat(Builder, {Lo, Hi});
3261 Value *Shift = Builder.CreateLShr(Pair, Amt, "lsr");
3262 Value *Trunc =
3263 Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");
3264 return Builder.CreateBitCast(Trunc, Lo->getType(), "cst");
3265 }
3266 if (VecLen == 8) {
3267 Type *Int64Ty = Type::getInt64Ty(F.getContext());
3268 Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty, "cst");
3269 Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty, "cst");
3270 Value *Call = Builder.CreateIntrinsic(Intrinsic::hexagon_S2_valignrb,
3271 {Hi64, Lo64, Amt},
3272 /*FMFSource=*/nullptr, "cup");
3273 return Builder.CreateBitCast(Call, Lo->getType(), "cst");
3274 }
3275 llvm_unreachable("Unexpected vector length");
3276}
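// For illustration (hypothetical 4-byte vectors): both functions read a
// VecLen-byte window out of the double-length value Hi:Lo. With
//   Lo = {l0, l1, l2, l3}, Hi = {h0, h1, h2, h3}, Amt = 1
//   vralignb(Lo, Hi, 1) -> {l1, l2, l3, h0}  // skip the first Amt bytes of Lo
//   vlalignb(Lo, Hi, 1) -> {l3, h0, h1, h2}  // keep the last Amt bytes of Lo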
3277
3278// Concatenates a sequence of vectors of the same type.
3279auto HexagonVectorCombine::concat(IRBuilderBase &Builder,
3280 ArrayRef<Value *> Vecs) const -> Value * {
3281 assert(!Vecs.empty());
3282 SmallVector<int, 256> SMask;
3283 std::vector<Value *> Work[2];
3284 int ThisW = 0, OtherW = 1;
3285
3286 Work[ThisW].assign(Vecs.begin(), Vecs.end());
3287 while (Work[ThisW].size() > 1) {
3288 auto *Ty = cast<VectorType>(Work[ThisW].front()->getType());
3289 SMask.resize(length(Ty) * 2);
3290 std::iota(SMask.begin(), SMask.end(), 0);
3291
3292 Work[OtherW].clear();
3293 if (Work[ThisW].size() % 2 != 0)
3294 Work[ThisW].push_back(UndefValue::get(Ty));
3295 for (int i = 0, e = Work[ThisW].size(); i < e; i += 2) {
3296 Value *Joined = Builder.CreateShuffleVector(
3297 Work[ThisW][i], Work[ThisW][i + 1], SMask, "shf");
3298 Work[OtherW].push_back(Joined);
3299 }
3300 std::swap(ThisW, OtherW);
3301 }
3302
3303 // Since there may have been some undefs appended to make shuffle operands
3304 // have the same type, perform the last shuffle to only pick the original
3305 // elements.
3306 SMask.resize(Vecs.size() * length(Vecs.front()->getType()));
3307 std::iota(SMask.begin(), SMask.end(), 0);
3308 Value *Total = Work[ThisW].front();
3309 return Builder.CreateShuffleVector(Total, SMask, "shf");
3310}
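// For illustration (a sketch with hypothetical inputs): concatenating three
// length-4 vectors A, B, C proceeds in rounds of pairwise shuffles. The
// worklist is padded to {A, B, C, undef}, the first round joins pairs into
// two length-8 vectors, the second round joins those into one length-16
// vector, and the final shuffle with mask {0, ..., 11} trims the undef tail
// so only the 12 original elements remain.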
3311
3312auto HexagonVectorCombine::vresize(IRBuilderBase &Builder, Value *Val,
3313 int NewSize, Value *Pad) const -> Value * {
3314 assert(isa<VectorType>(Val->getType()));
3315 auto *ValTy = cast<VectorType>(Val->getType());
3316 assert(ValTy->getElementType() == Pad->getType());
3317
3318 int CurSize = length(ValTy);
3319 if (CurSize == NewSize)
3320 return Val;
3321 // Truncate?
3322 if (CurSize > NewSize)
3323 return getElementRange(Builder, Val, /*Ignored*/ Val, 0, NewSize);
3324 // Extend.
3325 SmallVector<int, 128> SMask(NewSize);
3326 std::iota(SMask.begin(), SMask.begin() + CurSize, 0);
3327 std::fill(SMask.begin() + CurSize, SMask.end(), CurSize);
3328 Value *PadVec = Builder.CreateVectorSplat(CurSize, Pad, "spt");
3329 return Builder.CreateShuffleVector(Val, PadVec, SMask, "shf");
3330}
3331
3332auto HexagonVectorCombine::rescale(IRBuilderBase &Builder, Value *Mask,
3333 Type *FromTy, Type *ToTy) const -> Value * {
3334 // Mask is a vector <N x i1>, where each element corresponds to an
3335 // element of FromTy. Remap it so that each element will correspond
3336 // to an element of ToTy.
3337 assert(isa<VectorType>(Mask->getType()));
3338
3339 Type *FromSTy = FromTy->getScalarType();
3340 Type *ToSTy = ToTy->getScalarType();
3341 if (FromSTy == ToSTy)
3342 return Mask;
3343
3344 int FromSize = getSizeOf(FromSTy);
3345 int ToSize = getSizeOf(ToSTy);
3346 assert(FromSize % ToSize == 0 || ToSize % FromSize == 0);
3347
3348 auto *MaskTy = cast<VectorType>(Mask->getType());
3349 int FromCount = length(MaskTy);
3350 int ToCount = (FromCount * FromSize) / ToSize;
3351 assert((FromCount * FromSize) % ToSize == 0);
3352
3353 auto *FromITy = getIntTy(FromSize * 8);
3354 auto *ToITy = getIntTy(ToSize * 8);
3355
3356 // Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> ->
3357 // -> trunc to <M x i1>.
3358 Value *Ext = Builder.CreateSExt(
3359 Mask, VectorType::get(FromITy, FromCount, /*Scalable=*/false), "sxt");
3360 Value *Cast = Builder.CreateBitCast(
3361 Ext, VectorType::get(ToITy, ToCount, /*Scalable=*/false), "cst");
3362 return Builder.CreateTrunc(
3363 Cast, VectorType::get(getBoolTy(), ToCount, /*Scalable=*/false), "trn");
3364}
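// A small worked example (hypothetical types, for illustration only):
// remapping a mask from i32 elements to i16 elements doubles the bit count.
// For a <4 x i1> mask {1, 0, 1, 1} over <4 x i32>:
//   sext    -> <4 x i32> {-1,  0, -1, -1}
//   bitcast -> <8 x i16> {-1, -1,  0,  0, -1, -1, -1, -1}
//   trunc   -> <8 x i1>  { 1,  1,  0,  0,  1,  1,  1,  1}
// so each original lane's bit is replicated across the lanes it now covers.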
3365
3366// Bitcast to bytes, and return least significant bits.
3367auto HexagonVectorCombine::vlsb(IRBuilderBase &Builder, Value *Val) const
3368 -> Value * {
3369 Type *ScalarTy = Val->getType()->getScalarType();
3370 if (ScalarTy == getBoolTy())
3371 return Val;
3372
3373 Value *Bytes = vbytes(Builder, Val);
3374 if (auto *VecTy = dyn_cast<VectorType>(Bytes->getType()))
3375 return Builder.CreateTrunc(Bytes, getBoolTy(getSizeOf(VecTy)), "trn");
3376 // If Bytes is a scalar (i.e. Val was a scalar byte), return i1, not
3377 // <1 x i1>.
3378 return Builder.CreateTrunc(Bytes, getBoolTy(), "trn");
3379}
3380
3381// Bitcast to bytes for non-bool. For bool, convert i1 -> i8.
3382auto HexagonVectorCombine::vbytes(IRBuilderBase &Builder, Value *Val) const
3383 -> Value * {
3384 Type *ScalarTy = Val->getType()->getScalarType();
3385 if (ScalarTy == getByteTy())
3386 return Val;
3387
3388 if (ScalarTy != getBoolTy())
3389 return Builder.CreateBitCast(Val, getByteTy(getSizeOf(Val)), "cst");
3390 // For bool, return a sext from i1 to i8.
3391 if (auto *VecTy = dyn_cast<VectorType>(Val->getType()))
3392 return Builder.CreateSExt(Val, VectorType::get(getByteTy(), VecTy), "sxt");
3393 return Builder.CreateSExt(Val, getByteTy(), "sxt");
3394}
3395
3396auto HexagonVectorCombine::subvector(IRBuilderBase &Builder, Value *Val,
3397 unsigned Start, unsigned Length) const
3398 -> Value * {
3399 assert(Start + Length <= length(Val));
3400 return getElementRange(Builder, Val, /*Ignored*/ Val, Start, Length);
3401}
3402
3403auto HexagonVectorCombine::sublo(IRBuilderBase &Builder, Value *Val) const
3404 -> Value * {
3405 size_t Len = length(Val);
3406 assert(Len % 2 == 0 && "Length should be even");
3407 return subvector(Builder, Val, 0, Len / 2);
3408}
3409
3410auto HexagonVectorCombine::subhi(IRBuilderBase &Builder, Value *Val) const
3411 -> Value * {
3412 size_t Len = length(Val);
3413 assert(Len % 2 == 0 && "Length should be even");
3414 return subvector(Builder, Val, Len / 2, Len / 2);
3415}
3416
3417auto HexagonVectorCombine::vdeal(IRBuilderBase &Builder, Value *Val0,
3418 Value *Val1) const -> Value * {
3419 assert(Val0->getType() == Val1->getType());
3420 int Len = length(Val0);
3421 SmallVector<int, 128> Mask(2 * Len);
3422
3423 for (int i = 0; i != Len; ++i) {
3424 Mask[i] = 2 * i; // Even
3425 Mask[i + Len] = 2 * i + 1; // Odd
3426 }
3427 return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");
3428}
3429
3430auto HexagonVectorCombine::vshuff(IRBuilderBase &Builder, Value *Val0,
3431 Value *Val1) const -> Value * { //
3432 assert(Val0->getType() == Val1->getType());
3433 int Len = length(Val0);
3434 SmallVector<int, 128> Mask(2 * Len);
3435
3436 for (int i = 0; i != Len; ++i) {
3437 Mask[2 * i + 0] = i; // Val0
3438 Mask[2 * i + 1] = i + Len; // Val1
3439 }
3440 return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");
3441}
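// For illustration (hypothetical length-4 inputs A = {a0..a3}, B = {b0..b3}):
//   vdeal  mask = {0, 2, 4, 6, 1, 3, 5, 7} -> {a0, a2, b0, b2, a1, a3, b1, b3}
//   vshuff mask = {0, 4, 1, 5, 2, 6, 3, 7} -> {a0, b0, a1, b1, a2, b2, a3, b3}
// i.e. vdeal gathers the even lanes of both inputs ahead of the odd lanes,
// while vshuff interleaves the two inputs lane by lane.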
3442
3443auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,
3444 Intrinsic::ID IntID, Type *RetTy,
3445 ArrayRef<Value *> Args,
3446 ArrayRef<Type *> ArgTys,
3447 ArrayRef<Value *> MDSources) const
3448 -> Value * {
3449 auto getCast = [&](IRBuilderBase &Builder, Value *Val,
3450 Type *DestTy) -> Value * {
3451 Type *SrcTy = Val->getType();
3452 if (SrcTy == DestTy)
3453 return Val;
3454
3455 // Non-HVX type. It should be a scalar, and it should already have
3456 // a valid type.
3457 assert(HST.isTypeForHVX(SrcTy, /*IncludeBool=*/true));
3458
3459 Type *BoolTy = Type::getInt1Ty(F.getContext());
3460 if (cast<VectorType>(SrcTy)->getElementType() != BoolTy)
3461 return Builder.CreateBitCast(Val, DestTy, "cst");
3462
3463 // Predicate HVX vector.
3464 unsigned HwLen = HST.getVectorLength();
3465 Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast
3466 : Intrinsic::hexagon_V6_pred_typecast_128B;
3467 return Builder.CreateIntrinsic(TC, {DestTy, Val->getType()}, {Val},
3468 /*FMFSource=*/nullptr, "cup");
3469 };
3470
3471 Function *IntrFn =
3472 Intrinsic::getOrInsertDeclaration(F.getParent(), IntID, ArgTys);
3473 FunctionType *IntrTy = IntrFn->getFunctionType();
3474
3475 SmallVector<Value *, 4> IntrArgs;
3476 for (int i = 0, e = Args.size(); i != e; ++i) {
3477 Value *A = Args[i];
3478 Type *T = IntrTy->getParamType(i);
3479 if (A->getType() != T) {
3480 IntrArgs.push_back(getCast(Builder, A, T));
3481 } else {
3482 IntrArgs.push_back(A);
3483 }
3484 }
3485 StringRef MaybeName = !IntrTy->getReturnType()->isVoidTy() ? "cup" : "";
3486 CallInst *Call = Builder.CreateCall(IntrFn, IntrArgs, MaybeName);
3487
3488 MemoryEffects ME = Call->getAttributes().getMemoryEffects();
3489 if (!ME.doesNotAccessMemory() && !ME.onlyAccessesInaccessibleMem())
3490 propagateMetadata(Call, MDSources);
3491
3492 Type *CallTy = Call->getType();
3493 if (RetTy == nullptr || CallTy == RetTy)
3494 return Call;
3495 // Scalar types should have RetTy matching the call return type.
3496 assert(HST.isTypeForHVX(CallTy, /*IncludeBool=*/true));
3497 return getCast(Builder, Call, RetTy);
3498}
3499
3500auto HexagonVectorCombine::splitVectorElements(IRBuilderBase &Builder,
3501 Value *Vec,
3502 unsigned ToWidth) const
3503 -> SmallVector<Value *> {
3504 // Break a vector of wide elements into a series of vectors with narrow
3505 // elements:
3506 // (...c0:b0:a0, ...c1:b1:a1, ...c2:b2:a2, ...)
3507 // -->
3508 // (a0, a1, a2, ...) // lowest "ToWidth" bits
3509 // (b0, b1, b2, ...) // the next lowest...
3510 // (c0, c1, c2, ...) // ...
3511 // ...
3512 //
3513 // The number of elements in each resulting vector is the same as
3514 // in the original vector.
3515
3516 auto *VecTy = cast<VectorType>(Vec->getType());
3517 assert(VecTy->getElementType()->isIntegerTy());
3518 unsigned FromWidth = VecTy->getScalarSizeInBits();
3519 assert(isPowerOf2_32(ToWidth) && isPowerOf2_32(FromWidth));
3520 assert(ToWidth <= FromWidth && "Breaking up into wider elements?");
3521 unsigned NumResults = FromWidth / ToWidth;
3522
3523 SmallVector<Value *> Results(NumResults);
3524 Results[0] = Vec;
3525 unsigned Length = length(VecTy);
3526
3527 // Do it by splitting in half, since those operations correspond to deal
3528 // instructions.
3529 auto splitInHalf = [&](unsigned Begin, unsigned End, auto splitFunc) -> void {
3530 // Take V = Results[Begin], split it in L, H.
3531 // Store Results[Begin] = L, Results[(Begin+End)/2] = H
3532 // Call itself recursively split(Begin, Half), split(Half+1, End)
3533 if (Begin + 1 == End)
3534 return;
3535
3536 Value *Val = Results[Begin];
3537 unsigned Width = Val->getType()->getScalarSizeInBits();
3538
3539 auto *VTy = VectorType::get(getIntTy(Width / 2), 2 * Length, false);
3540 Value *VVal = Builder.CreateBitCast(Val, VTy, "cst");
3541
3542 Value *Res = vdeal(Builder, sublo(Builder, VVal), subhi(Builder, VVal));
3543
3544 unsigned Half = (Begin + End) / 2;
3545 Results[Begin] = sublo(Builder, Res);
3546 Results[Half] = subhi(Builder, Res);
3547
3548 splitFunc(Begin, Half, splitFunc);
3549 splitFunc(Half, End, splitFunc);
3550 };
3551
3552 splitInHalf(0, NumResults, splitInHalf);
3553 return Results;
3554}
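// For illustration (a sketch with a hypothetical <N x i32> input and
// ToWidth = 8): the call returns four results of type <N x i8>, where
// Results[0] holds bits 7..0 of every element, Results[1] bits 15..8,
// Results[2] bits 23..16, and Results[3] bits 31..24. Each recursion level
// bitcasts to half-width elements and uses vdeal to separate the low halves
// of the elements from the high halves.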
3555
3556auto HexagonVectorCombine::joinVectorElements(IRBuilderBase &Builder,
3557 ArrayRef<Value *> Values,
3558 VectorType *ToType) const
3559 -> Value * {
3560 assert(ToType->getElementType()->isIntegerTy());
3561
3562 // If the list of values does not have power-of-2 elements, append copies
3563 // of the sign bit to it, to make the size be 2^n.
3564 // The reason for this is that the values will be joined in pairs, because
3565 // otherwise the shuffles will result in convoluted code. With pairwise
3566 // joins, the shuffles will hopefully be folded into a perfect shuffle.
3567 // The output will need to be sign-extended to a type with element width
3568 // being a power-of-2 anyways.
3569 SmallVector<Value *> Inputs(Values);
3570
3571 unsigned ToWidth = ToType->getScalarSizeInBits();
3572 unsigned Width = Inputs.front()->getType()->getScalarSizeInBits();
3573 assert(Width <= ToWidth);
3574 assert(isPowerOf2_32(Width) && isPowerOf2_32(ToWidth));
3575 unsigned Length = length(Inputs.front()->getType());
3576
3577 unsigned NeedInputs = ToWidth / Width;
3578 if (Inputs.size() != NeedInputs) {
3579 // Having too many inputs is ok: drop the high bits (usual wrap-around).
3580 // If there are too few, fill them with the sign bit.
3581 Value *Last = Inputs.back();
3582 Value *Sign = Builder.CreateAShr(
3583 Last, getConstSplat(Last->getType(), Width - 1), "asr");
3584 Inputs.resize(NeedInputs, Sign);
3585 }
3586
3587 while (Inputs.size() > 1) {
3588 Width *= 2;
3589 auto *VTy = VectorType::get(getIntTy(Width), Length, false);
3590 for (int i = 0, e = Inputs.size(); i < e; i += 2) {
3591 Value *Res = vshuff(Builder, Inputs[i], Inputs[i + 1]);
3592 Inputs[i / 2] = Builder.CreateBitCast(Res, VTy, "cst");
3593 }
3594 Inputs.resize(Inputs.size() / 2);
3595 }
3596
3597 assert(Inputs.front()->getType() == ToType);
3598 return Inputs.front();
3599}
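// For illustration (the reverse of the sketch after splitVectorElements):
// joining four <N x i8> inputs into <N x i32> proceeds pairwise: vshuff
// interleaves inputs 0/1 and 2/3 and each result is bitcast to <N x i16>,
// then one more vshuff + bitcast yields the <N x i32> value. If fewer than
// ToWidth/Width inputs are given, the missing high parts are filled with
// copies of the last input's per-lane sign bits, as described above.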
3600
3601auto HexagonVectorCombine::calculatePointerDifference(Value *Ptr0,
3602 Value *Ptr1) const
3603 -> std::optional<int> {
3604 // Try SCEV first.
3605 const SCEV *Scev0 = SE.getSCEV(Ptr0);
3606 const SCEV *Scev1 = SE.getSCEV(Ptr1);
3607 const SCEV *ScevDiff = SE.getMinusSCEV(Scev0, Scev1);
3608 if (auto *Const = dyn_cast<SCEVConstant>(ScevDiff)) {
3609 APInt V = Const->getAPInt();
3610 if (V.isSignedIntN(8 * sizeof(int)))
3611 return static_cast<int>(V.getSExtValue());
3612 }
3613
3614 struct Builder : IRBuilder<> {
3615 Builder(BasicBlock *B) : IRBuilder<>(B->getTerminator()) {}
3616 ~Builder() {
3617 for (Instruction *I : llvm::reverse(ToErase))
3618 I->eraseFromParent();
3619 }
3620 SmallVector<Instruction *, 8> ToErase;
3621 };
3622
3623#define CallBuilder(B, F) \
3624 [&](auto &B_) { \
3625 Value *V = B_.F; \
3626 if (auto *I = dyn_cast<Instruction>(V)) \
3627 B_.ToErase.push_back(I); \
3628 return V; \
3629 }(B)
3630
3631 auto Simplify = [this](Value *V) {
3632 if (Value *S = simplify(V))
3633 return S;
3634 return V;
3635 };
3636
3637 auto StripBitCast = [](Value *V) {
3638 while (auto *C = dyn_cast<BitCastInst>(V))
3639 V = C->getOperand(0);
3640 return V;
3641 };
3642
3643 Ptr0 = StripBitCast(Ptr0);
3644 Ptr1 = StripBitCast(Ptr1);
3645 if (!isa<GetElementPtrInst>(Ptr0) || !isa<GetElementPtrInst>(Ptr1))
3646 return std::nullopt;
3647
3648 auto *Gep0 = cast<GetElementPtrInst>(Ptr0);
3649 auto *Gep1 = cast<GetElementPtrInst>(Ptr1);
3650 if (Gep0->getPointerOperand() != Gep1->getPointerOperand())
3651 return std::nullopt;
3652 if (Gep0->getSourceElementType() != Gep1->getSourceElementType())
3653 return std::nullopt;
3654
3655 Builder B(Gep0->getParent());
3656 int Scale = getSizeOf(Gep0->getSourceElementType(), Alloc);
3657
3658 // FIXME: for now only check GEPs with a single index.
3659 if (Gep0->getNumOperands() != 2 || Gep1->getNumOperands() != 2)
3660 return std::nullopt;
3661
3662 Value *Idx0 = Gep0->getOperand(1);
3663 Value *Idx1 = Gep1->getOperand(1);
3664
3665 // First, try to simplify the subtraction directly.
3666 if (auto *Diff = dyn_cast<ConstantInt>(
3667 Simplify(CallBuilder(B, CreateSub(Idx0, Idx1)))))
3668 return Diff->getSExtValue() * Scale;
3669
3670 KnownBits Known0 = getKnownBits(Idx0, Gep0);
3671 KnownBits Known1 = getKnownBits(Idx1, Gep1);
3672 APInt Unknown = ~(Known0.Zero | Known0.One) | ~(Known1.Zero | Known1.One);
3673 if (Unknown.isAllOnes())
3674 return std::nullopt;
3675
3676 Value *MaskU = ConstantInt::get(Idx0->getType(), Unknown);
3677 Value *AndU0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskU)));
3678 Value *AndU1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskU)));
3679 Value *SubU = Simplify(CallBuilder(B, CreateSub(AndU0, AndU1)));
3680 int Diff0 = 0;
3681 if (auto *C = dyn_cast<ConstantInt>(SubU)) {
3682 Diff0 = C->getSExtValue();
3683 } else {
3684 return std::nullopt;
3685 }
3686
3687 Value *MaskK = ConstantInt::get(MaskU->getType(), ~Unknown);
3688 Value *AndK0 = Simplify(CallBuilder(B, CreateAnd(Idx0, MaskK)));
3689 Value *AndK1 = Simplify(CallBuilder(B, CreateAnd(Idx1, MaskK)));
3690 Value *SubK = Simplify(CallBuilder(B, CreateSub(AndK0, AndK1)));
3691 int Diff1 = 0;
3692 if (auto *C = dyn_cast<ConstantInt>(SubK)) {
3693 Diff1 = C->getSExtValue();
3694 } else {
3695 return std::nullopt;
3696 }
3697
3698 return (Diff0 + Diff1) * Scale;
3699
3700#undef CallBuilder
3701}
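// For illustration (a hedged sketch with hypothetical values): when the index
// difference does not fold directly, the indexes are split by known bits.
// If, say, Idx0 = B | 2 and Idx1 = B with the low two bits of B known to be
// zero, the unknown high parts cancel (Diff0 = 0), the known low parts give
// Diff1 = 2, and the returned distance is (0 + 2) * Scale bytes, where Scale
// is the allocation size of the GEP source element type.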
3702
3703auto HexagonVectorCombine::getNumSignificantBits(const Value *V,
3704 const Instruction *CtxI) const
3705 -> unsigned {
3706 return ComputeMaxSignificantBits(V, DL, &AC, CtxI, &DT);
3707}
3708
3709auto HexagonVectorCombine::getKnownBits(const Value *V,
3710 const Instruction *CtxI) const
3711 -> KnownBits {
3712 return computeKnownBits(V, DL, &AC, CtxI, &DT);
3713}
3714
3715auto HexagonVectorCombine::isSafeToClone(const Instruction &In) const -> bool {
3716 if (In.mayHaveSideEffects() || In.isAtomic() || In.isVolatile() ||
3717 In.isFenceLike() || In.mayReadOrWriteMemory()) {
3718 return false;
3719 }
3720 if (isa<CallBase>(In) || isa<AllocaInst>(In))
3721 return false;
3722 return true;
3723}
3724
3725template <typename T>
3726auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
3727 BasicBlock::const_iterator To,
3728 const T &IgnoreInsts) const
3729 -> bool {
3730 auto getLocOrNone =
3731 [this](const Instruction &I) -> std::optional<MemoryLocation> {
3732 if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
3733 switch (II->getIntrinsicID()) {
3734 case Intrinsic::masked_load:
3735 return MemoryLocation::getForArgument(II, 0, TLI);
3736 case Intrinsic::masked_store:
3737 return MemoryLocation::getForArgument(II, 1, TLI);
3738 }
3739 }
3740 return MemoryLocation::getOrNone(&I);
3741 };
3742
3743 // The source and the destination must be in the same basic block.
3744 const BasicBlock &Block = *In.getParent();
3745 assert(Block.begin() == To || Block.end() == To || To->getParent() == &Block);
3746 // No PHIs.
3747 if (isa<PHINode>(In) || (To != Block.end() && isa<PHINode>(*To)))
3748 return false;
3749
3750 if (!mayHaveNonDefUseDependency(In))
3751 return true;
3752 bool MayWrite = In.mayWriteToMemory();
3753 auto MaybeLoc = getLocOrNone(In);
3754
3755 auto From = In.getIterator();
3756 if (From == To)
3757 return true;
3758 bool MoveUp = (To != Block.end() && To->comesBefore(&In));
3759 auto Range =
3760 MoveUp ? std::make_pair(To, From) : std::make_pair(std::next(From), To);
3761 for (auto It = Range.first; It != Range.second; ++It) {
3762 const Instruction &I = *It;
3763 if (llvm::is_contained(IgnoreInsts, &I))
3764 continue;
3765 // assume intrinsic can be ignored
3766 if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
3767 if (II->getIntrinsicID() == Intrinsic::assume)
3768 continue;
3769 }
3770 // Parts based on isSafeToMoveBefore from CodeMoverUtils.cpp.
3771 if (I.mayThrow())
3772 return false;
3773 if (auto *CB = dyn_cast<CallBase>(&I)) {
3774 if (!CB->hasFnAttr(Attribute::WillReturn))
3775 return false;
3776 if (!CB->hasFnAttr(Attribute::NoSync))
3777 return false;
3778 }
3779 if (I.mayReadOrWriteMemory()) {
3780 auto MaybeLocI = getLocOrNone(I);
3781 if (MayWrite || I.mayWriteToMemory()) {
3782 if (!MaybeLoc || !MaybeLocI)
3783 return false;
3784 if (!AA.isNoAlias(*MaybeLoc, *MaybeLocI))
3785 return false;
3786 }
3787 }
3788 }
3789 return true;
3790}
3791
3792auto HexagonVectorCombine::isByteVecTy(Type *Ty) const -> bool {
3793 if (auto *VecTy = dyn_cast<VectorType>(Ty))
3794 return VecTy->getElementType() == getByteTy();
3795 return false;
3796}
3797
3798auto HexagonVectorCombine::getElementRange(IRBuilderBase &Builder, Value *Lo,
3799 Value *Hi, int Start,
3800 int Length) const -> Value * {
3801 assert(0 <= Start && size_t(Start + Length) < length(Lo) + length(Hi));
3802 SmallVector<int, 128> SMask(Length);
3803 std::iota(SMask.begin(), SMask.end(), Start);
3804 return Builder.CreateShuffleVector(Lo, Hi, SMask, "shf");
3805}
3806
3807// Pass management.
3808
3809namespace {
3810class HexagonVectorCombineLegacy : public FunctionPass {
3811public:
3812 static char ID;
3813
3814 HexagonVectorCombineLegacy() : FunctionPass(ID) {}
3815
3816 StringRef getPassName() const override { return "Hexagon Vector Combine"; }
3817
3818 void getAnalysisUsage(AnalysisUsage &AU) const override {
3819 AU.setPreservesCFG();
3820 AU.addRequired<AAResultsWrapperPass>();
3821 AU.addRequired<AssumptionCacheTracker>();
3822 AU.addRequired<DominatorTreeWrapperPass>();
3823 AU.addRequired<ScalarEvolutionWrapperPass>();
3824 AU.addRequired<TargetLibraryInfoWrapperPass>();
3825 AU.addRequired<TargetPassConfig>();
3826 FunctionPass::getAnalysisUsage(AU);
3827 }
3828
3829 bool runOnFunction(Function &F) override {
3830 if (skipFunction(F))
3831 return false;
3832 AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
3833 AssumptionCache &AC =
3834 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
3835 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
3836 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
3837 TargetLibraryInfo &TLI =
3838 getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
3839 auto &TM = getAnalysis<TargetPassConfig>().getTM<HexagonTargetMachine>();
3840 HexagonVectorCombine HVC(F, AA, AC, DT, SE, TLI, TM);
3841 return HVC.run();
3842 }
3843};
3844} // namespace
3845
3846char HexagonVectorCombineLegacy::ID = 0;
3847
3848INITIALIZE_PASS_BEGIN(HexagonVectorCombineLegacy, DEBUG_TYPE,
3849 "Hexagon Vector Combine", false, false)
3856INITIALIZE_PASS_END(HexagonVectorCombineLegacy, DEBUG_TYPE,
3857 "Hexagon Vector Combine", false, false)
3858
3859FunctionPass *llvm::createHexagonVectorCombineLegacyPass() {
3860 return new HexagonVectorCombineLegacy();
3861}