LLVM 14.0.0git
StringView.h
//===--- StringView.h -------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// FIXME: Use std::string_view instead when we support C++17.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_DEMANGLE_STRINGVIEW_H
#define LLVM_DEMANGLE_STRINGVIEW_H

#include "DemangleConfig.h"
#include <algorithm>
#include <cassert>
#include <cstring>

DEMANGLE_NAMESPACE_BEGIN

class StringView {
  const char *First;
  const char *Last;

public:
  static const size_t npos = ~size_t(0);

  template <size_t N>
  StringView(const char (&Str)[N]) : First(Str), Last(Str + N - 1) {}
  StringView(const char *First_, const char *Last_)
      : First(First_), Last(Last_) {}
  StringView(const char *First_, size_t Len)
      : First(First_), Last(First_ + Len) {}
  StringView(const char *Str) : First(Str), Last(Str + std::strlen(Str)) {}
  StringView() : First(nullptr), Last(nullptr) {}

  StringView substr(size_t Pos, size_t Len = npos) const {
    assert(Pos <= size());
    return StringView(begin() + Pos, std::min(Len, size() - Pos));
  }

  size_t find(char C, size_t From = 0) const {
    size_t FindBegin = std::min(From, size());
    // Avoid calling memchr with nullptr.
    if (FindBegin < size()) {
      // Just forward to memchr, which is faster than a hand-rolled loop.
      if (const void *P = ::memchr(First + FindBegin, C, size() - FindBegin))
        return size_t(static_cast<const char *>(P) - First);
    }
    return npos;
  }

  StringView dropFront(size_t N = 1) const {
    if (N >= size())
      N = size();
    return StringView(First + N, Last);
  }

  StringView dropBack(size_t N = 1) const {
    if (N >= size())
      N = size();
    return StringView(First, Last - N);
  }

  char front() const {
    assert(!empty());
    return *begin();
  }

  char back() const {
    assert(!empty());
    return *(end() - 1);
  }

  char popFront() {
    assert(!empty());
    return *First++;
  }

  bool consumeFront(char C) {
    if (!startsWith(C))
      return false;
    *this = dropFront(1);
    return true;
  }

  bool consumeFront(StringView S) {
    if (!startsWith(S))
      return false;
    *this = dropFront(S.size());
    return true;
  }

  bool startsWith(char C) const { return !empty() && *begin() == C; }

  bool startsWith(StringView Str) const {
    if (Str.size() > size())
      return false;
    return std::equal(Str.begin(), Str.end(), begin());
  }

  const char &operator[](size_t Idx) const { return *(begin() + Idx); }

  const char *begin() const { return First; }
  const char *end() const { return Last; }
  size_t size() const { return static_cast<size_t>(Last - First); }
  bool empty() const { return First == Last; }
};

inline bool operator==(const StringView &LHS, const StringView &RHS) {
  return LHS.size() == RHS.size() &&
         std::equal(LHS.begin(), LHS.end(), RHS.begin());
}

DEMANGLE_NAMESPACE_END

#endif
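
For orientation, a minimal usage sketch follows. It is not part of the header: it assumes the in-tree include path llvm/Demangle/StringView.h and that DEMANGLE_NAMESPACE_BEGIN (defined in DemangleConfig.h) places the class in llvm::itanium_demangle, as LLVM's copy does; adjust the include and using-declaration for other copies of the demangler. It exercises the non-owning slicing operations above: consumeFront to peel a known prefix, find plus substr/dropFront to slice without copying or allocating, and the saturating behavior of dropFront past the end.

// Hypothetical standalone driver; include path and namespace are assumptions
// (see the note above), everything else uses only the API shown in this file.
#include "llvm/Demangle/StringView.h"

#include <cstdio>

using llvm::itanium_demangle::StringView;

int main() {
  // The string-literal constructor drops the trailing '\0' (Last = Str + N - 1).
  StringView Name("_Z1fv");

  // consumeFront() only advances the view when the prefix matches.
  if (Name.consumeFront("_Z"))
    std::printf("mangled, remainder: %.*s\n", (int)Name.size(), Name.begin());

  // find() + substr()/dropFront() slice out sub-ranges without copying.
  StringView Path("llvm/Demangle/StringView.h");
  size_t Slash = Path.find('/');
  StringView Dir = Path.substr(0, Slash);       // "llvm"
  StringView Rest = Path.dropFront(Slash + 1);  // "Demangle/StringView.h"
  std::printf("%.*s | %.*s\n", (int)Dir.size(), Dir.begin(),
              (int)Rest.size(), Rest.begin());

  // Dropping past the end saturates to an empty view rather than overflowing.
  StringView Empty = Rest.dropFront(Rest.size() + 100);
  return Empty.empty() ? 0 : 1;
}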
z
return z
Definition: README.txt:14
i
i
Definition: README.txt:29
xmm2
gets compiled into this on rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps xmm2
Definition: README.txt:1120
algorithm
Improvements to the multiply shift add algorithm
Definition: README.txt:10
LCPI1_1
cond_true lis lo16() lo16() lo16() LCPI1_1(r2) fsub f0
xmm1
esp eax movl ecx ecx cvtsi2ss xmm0 eax cvtsi2ss xmm1 xmm0 addss xmm1
Definition: README.txt:305
L5
to esp esp setne al movzbw ax esp setg cl movzbw cx cmove cx cl jne LBB1_2 esp which is much esp edx eax decl edx jle L7 L5
Definition: README.txt:656
set
We currently generate a but we really shouldn eax ecx xorl edx divl ecx eax divl ecx movl eax ret A similar code sequence works for division We currently compile i32 v2 eax eax jo LBB1_2 atomic and others It is also currently not done for read modify write instructions It is also current not done if the OF or CF flags are needed The shift operators have the complication that when the shift count is EFLAGS is not set
Definition: README.txt:1277
f0
We f0
Definition: README.txt:76
CmpMode::FP
@ FP
zeros
zeros_impl< sizeof(T)> zeros(const T &)
Definition: COFFEmitter.cpp:345
noreg
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 noreg
Definition: README.txt:36
behavior
should just be implemented with a CLZ instruction Since there are other e that share this behavior
Definition: README.txt:709
ABI
Generic address nodes are lowered to some combination of target independent and machine specific ABI
Definition: Relocation.txt:34
is
should just be implemented with a CLZ instruction Since there are other e that share this it would be best to implement this in a target independent as zero is the default value for the binary encoder e add r0 add r5 Register operands should be distinct That is
Definition: README.txt:725
as
compiles conv shl5 shl ret i32 or10 it would be better as
Definition: README.txt:615
ecx
Instead of the following for memset char edx edx edx It might be better to generate eax movl edx movl edx movw edx when we can spare a register It reduces code size Evaluate what the best way to codegen sdiv C is For we currently get ret i32 Y eax movl ecx ecx ecx addl ecx
Definition: README.txt:147
produces
entry stw r5 blr GCC produces
Definition: README.txt:174
places
Itanium Name Demangler i e convert the string _Z1fv into but neither can depend on each other libcxxabi needs the demangler to implement which is part of the itanium ABI spec LLVM needs a copy for a bunch of places
Definition: README.txt:20
then
RUN< i32 >< i1 > br i1 label then
Definition: README.txt:338
targets
should just be implemented with a CLZ instruction Since there are other targets
Definition: README.txt:709
StringView::StringView
StringView()
Definition: StringView.h:37
right
the custom lowered code happens to be right
Definition: README-SSE.txt:480
Signed
@ Signed
Definition: NVPTXISelLowering.cpp:4636
readnone
We generate the following IR with i32 b nounwind readnone
Definition: README.txt:1443
tmp553554
< i32 > tmp553554
Definition: README.txt:560
libcalls
partially inline libcalls
Definition: PartiallyInlineLibCalls.cpp:208
comparisons
The same transformation can work with an even modulo with the addition of a and shrink the compare RHS by the same amount Unless the target supports that transformation probably isn t worthwhile The transformation can also easily be made to work with non zero equality comparisons
Definition: README.txt:685
GetHotKey
THotKey GetHotKey()
Definition: README.txt:470
constant
we should consider alternate ways to model stack dependencies Lots of things could be done in WebAssemblyTargetTransformInfo cpp there are numerous optimization related hooks that can be overridden in WebAssemblyTargetLowering Instead of the OptimizeReturned which should consider preserving the returned attribute through to MachineInstrs and extending the MemIntrinsicResults pass to do this optimization on calls too That would also let the WebAssemblyPeephole pass clean up dead defs for such as it does for stores Consider implementing and or getMachineCombinerPatterns Find a clean way to fix the problem which leads to the Shrink Wrapping pass being run after the WebAssembly PEI pass When setting multiple variables to the same constant
Definition: README.txt:91
requirement
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM ID Predecessors according to mbb< bb27, 0x8b0a7c0 > Note ADDri is not a two address instruction its result reg1037 is an operand of the PHI node in bb76 and its operand reg1039 is the result of the PHI node We should treat it as a two address code and make sure the ADDri is scheduled after any node that reads reg1039 Use info(i.e. register scavenger) to assign it a free register to allow reuse the collector could move the objects and invalidate the derived pointer This is bad enough in the first but safe points can crop up unpredictably **array_addr i32 n y store obj obj **nth_el If the i64 division is lowered to a then a safe point array and nth_el no longer point into the correct object The fix for this is to copy address calculations so that dependent pointers are never live across safe point boundaries But the loads cannot be copied like this if there was an intervening so may be hard to get right Only a concurrent mutator can trigger a collection at the libcall safe point So single threaded programs do not have this requirement
Definition: README.txt:136
llvm
---------------------— PointerInfo ------------------------------------—
Definition: AllocatorList.h:23
rcx
gets compiled into this on rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movq rsp movq rsp movq rcx
Definition: README.txt:1125
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
eax
Add support for conditional and other related patterns Instead eax eax je LBB16_2 eax edi eax movl eax
Definition: README.txt:145
c2
This might compile to this xmm1 xorps xmm0 movss xmm0 ret Now consider if the code caused xmm1 to get spilled This might produce this xmm1 movaps c2(%esp) ... xorps %xmm0
llvm::ARM_AM::lsr
@ lsr
Definition: ARMAddressingModes.h:31
subl
subl
Definition: README.txt:297
v2i32
gets compiled into this on rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movq rsp movq rsp movq rsp movq rsp movq rsp rax movq rsp rax movq rsp rsp rsp eax eax jbe LBB1_3 rcx rax movq rsp eax rsp ret ecx eax rcx movl rsp jmp LBB1_2 gcc rsp rax movq rsp rsp movq rsp rax movq rsp eax eax jb L6 rdx eax rsp ret p2align edx rdx eax movl rsp eax rsp ret and it gets compiled into this on ebp esp eax movl ebp eax movl ebp eax esp popl ebp ret gcc ebp eax popl ebp ret Teach tblgen not to check bitconvert source type in some cases This allows us to consolidate the following patterns in X86InstrMMX v2i32(MMX_MOVDQ2Qrr VR128:$src))>
LLVM
Improvements to the multiply shift add e g in LLVM
Definition: README.txt:11
StringView::StringView
StringView(const char *First_, size_t Len)
Definition: StringView.h:34
use
Should compile r2 movcc movcs str strb mov lr r1 movcs movcc mov lr r1 str mov mov cmp r1 movlo r2 str bx lr r0 mov mov cmp r0 movhs r2 mov r1 bx lr Some of the NEON intrinsics may be appropriate for more general use
Definition: README.txt:484
cmpl
Add support for conditional and other related patterns Instead eax cmpl
Definition: README.txt:135
zero
We currently generate a but we really shouldn eax ecx xorl edx divl ecx eax divl ecx movl eax ret A similar code sequence works for division We currently compile i32 v2 eax eax jo LBB1_2 atomic and others It is also currently not done for read modify write instructions It is also current not done if the OF or CF flags are needed The shift operators have the complication that when the shift count is zero
Definition: README.txt:1277
StringView::operator[]
const char & operator[](size_t Idx) const
Definition: StringView.h:104
simplify
hexagon bit simplify
Definition: HexagonBitSimplify.cpp:261
ceil
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g ceil
Definition: README-FPStack.txt:54
ldmia
add sub stmia L5 ldr r0 bl L_printf $stub Instead of a ldmia
Definition: README.txt:204
StringView::size
size_t size() const
Definition: StringView.h:108
see
The object format emitted by the WebAssembly backed is documented see the home and packaging for producing WebAssembly applications that can run in browsers and other environments wasi sdk provides a more minimal C C SDK based on llvm and a libc based on for producing WebAssemmbly applictions that use the WASI ABI Rust provides WebAssembly support integrated into Cargo There are two main which provides a relatively minimal environment that has an emphasis on being native wasm32 unknown which uses Emscripten internally and provides standard C C filesystem GL and SDL bindings For more see
Definition: README.txt:41
Similarly
we should consider alternate ways to model stack dependencies Lots of things could be done in WebAssemblyTargetTransformInfo cpp Similarly
Definition: README.txt:67
return
return
Definition: README.txt:242
tmp6
< i32 > tmp6
Definition: README.txt:1446
Note
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles Note
Definition: README.txt:239
ManglingParser
Definition: ItaniumDemangle.h:5730
tmp2
< i1 > br i1 tmp2
Definition: README.txt:976
xmm7
gets compiled into this on rsp movaps xmm7
Definition: README.txt:1115
pointer
Replace within non kernel function use of LDS with pointer
Definition: AMDGPUReplaceLDSUseWithPointer.cpp:443
P
This currently compiles esp xmm0 movsd esp eax eax esp ret We should use not the dag combiner This is because dagcombine2 needs to be able to see through the X86ISD::Wrapper which DAGCombine can t really do The code for turning x load into a single vector load is target independent and should be moved to the dag combiner The code for turning x load into a vector load can only handle a direct load from a global or a direct load from the stack It should be generalized to handle any load from P
Definition: README-SSE.txt:411
sroa
sroa
Definition: SROA.cpp:4881
fix
arm execution domain fix
Definition: ARMTargetMachine.cpp:391
change
< i32 > br label bb114 eax ecx movl ebp subl ebp eax movl ebp subl ebp eax movl ebp subl ebp eax subl ecx movl ebp eax movl ebp eax movl ebp ebp This appears to be bad because the RA is not folding the store to the stack slot into the movl The above instructions could ebp ebp This seems like a cross between remat and spill folding This has redundant subtractions of eax from a stack slot ecx doesn t change
Definition: README.txt:599
xorb
Should compile edi setae al movzbl eax ret on instead of the rather stupid edi setb al xorb
Definition: README.txt:545
ops
We currently generate a but we really shouldn eax ecx xorl edx divl ecx eax divl ecx movl eax ret A similar code sequence works for division We currently compile i32 v2 eax eax jo LBB1_2 atomic ops
Definition: README.txt:1272
include
include(LLVM-Build) add_subdirectory(IR) add_subdirectory(FuzzMutate) add_subdirectory(FileCheck) add_subdirectory(InterfaceStub) add_subdirectory(IRReader) add_subdirectory(CodeGen) add_subdirectory(BinaryFormat) add_subdirectory(Bitcode) add_subdirectory(Bitstream) add_subdirectory(DWARFLinker) add_subdirectory(Extensions) add_subdirectory(Frontend) add_subdirectory(Transforms) add_subdirectory(Linker) add_subdirectory(Analysis) add_subdirectory(LTO) add_subdirectory(MC) add_subdirectory(MCA) add_subdirectory(Object) add_subdirectory(ObjectYAML) add_subdirectory(Option) add_subdirectory(Remarks) add_subdirectory(DebugInfo) add_subdirectory(DWP) add_subdirectory(ExecutionEngine) add_subdirectory(Target) add_subdirectory(AsmParser) add_subdirectory(LineEditor) add_subdirectory(ProfileData) add_subdirectory(Passes) add_subdirectory(TextAPI) add_subdirectory(ToolDrivers) add_subdirectory(XRay) if(LLVM_INCLUDE_TESTS) add_subdirectory(Testing) endif() add_subdirectory(WindowsManifest) set(LLVMCONFIGLIBRARYDEPENDENCIESINC "$
Definition: CMakeLists.txt:1
stuff
_bar mov r0 mov r1 fldd LCPI1_0 fmrrd d0 bl _foo fmdrr r5 fmsr r0 fsitod s2 faddd d0 fmrrd d0 ldmfd r0 mov r1 the copys to callee save registers and the fact they are only being used by the fmdrr instruction It would have been better had the fmdrr been scheduled before the call and place the result in a callee save DPR register The two mov ops would not have been necessary Calling convention related stuff
Definition: README.txt:181
minimum
Should compile r2 movcc movcs str strb mov lr r1 movcs movcc mov lr r1 str mov mov cmp r1 movlo r2 str bx lr r0 mov mov cmp r0 movhs r2 mov r1 bx lr Some of the NEON intrinsics may be appropriate for more general either as target independent intrinsics or perhaps elsewhere in the ARM backend Some of them may also be lowered to target independent and perhaps some new SDNodes could be added For minimum
Definition: README.txt:489
store
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM ID Predecessors according to mbb< bb27, 0x8b0a7c0 > Note ADDri is not a two address instruction its result reg1037 is an operand of the PHI node in bb76 and its operand reg1039 is the result of the PHI node We should treat it as a two address code and make sure the ADDri is scheduled after any node that reads reg1039 Use info(i.e. register scavenger) to assign it a free register to allow reuse the collector could move the objects and invalidate the derived pointer This is bad enough in the first but safe points can crop up unpredictably **array_addr i32 n y store obj obj **nth_el If the i64 division is lowered to a then a safe point array and nth_el no longer point into the correct object The fix for this is to copy address calculations so that dependent pointers are never live across safe point boundaries But the loads cannot be copied like this if there was an intervening store
Definition: README.txt:133
contains
return AArch64::GPR64RegClass contains(Reg)
br_if
The object format emitted by the WebAssembly backed is documented see the home and packaging for producing WebAssembly applications that can run in browsers and other environments wasi sdk provides a more minimal C C SDK based on llvm and a libc based on for producing WebAssemmbly applictions that use the WASI ABI Rust provides WebAssembly support integrated into Cargo There are two main which provides a relatively minimal environment that has an emphasis on being native wasm32 unknown which uses Emscripten internally and provides standard C C filesystem GL and SDL bindings For more br_if
Definition: README.txt:41
double
into xmm2 addss xmm2 xmm1 xmm3 addss xmm3 movaps xmm0 unpcklps xmm0 ret seems silly when it could just be one addps Expand libm rounding functions main should enable SSE DAZ mode and other fast SSE modes Think about doing i64 math in SSE regs on x86 This testcase should have no SSE instructions in and only one load from a constant double
Definition: README-SSE.txt:85
v2i64
QP Compare Ordered outs ins xscmpudp No builtin are required Or llvm fcmp order unorder compare DP QP Compare builtin are required DP xscmp *dp write to VSX register Use int_ppc_vsx_xscmpeqdp int_ppc_vsx_xscmpgedp int_ppc_vsx_xscmpgtdp int_ppc_vsx_xscmpnedp v2i64
Definition: README_P9.txt:323
uses
This might compile to this xmm1 xorps xmm0 movss xmm0 ret Now consider if the code caused xmm1 to get spilled This might produce this xmm1 movaps xmm0 movaps xmm1 movss xmm0 ret since the reload is only used by these we could fold it into the uses
Definition: README-SSE.txt:258
output
Current output
Definition: README.txt:1350
llvm::tgtok::Code
@ Code
Definition: TGLexer.h:50
llvm::AMDGPU::IsaInfo::TargetIDSetting::On
@ On
codegen
Since we know that Vector is byte aligned and we know the element offset of we should change the load into a lve *x instead of doing a load store lve *x sequence Implement passing vectors by value into calls and receiving them as arguments GCC apparently tries to codegen
Definition: README_ALTIVEC.txt:46
llvm::PatternMatch::m_Add
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
Definition: PatternMatch.h:1008
away
AMD64 Optimization Manual has some nice information about optimizing integer multiplication by a constant How much of it applies to Intel s X86 implementation There are definite trade offs to xmm0 cvttss2siq rdx jb L3 subss xmm0 rax cvttss2siq rdx xorq rdx rax ret instead of xmm1 cvttss2siq rcx movaps xmm2 subss xmm2 cvttss2siq rax rdx xorq rax ucomiss xmm0 cmovb rax ret Seems like the jb branch has high likelihood of being taken It would have saved a few instructions It s not possible to reference and DH registers in an instruction requiring REX prefix divb and mulb both produce results in AH If isel emits a CopyFromReg which gets turned into a movb and that can be allocated a r8b r15b To get around isel emits a CopyFromReg from AX and then right shift it down by and truncate it It s not pretty but it works We need some register allocation magic to make the hack go away(e.g. putting additional constraints on the result of the movb). The x86-64 ABI for hidden-argument struct returns requires that the incoming value of %rdi be copied into %rax by the callee upon return. The idea is that it saves callers from having to remember this value
tools
The object format emitted by the WebAssembly backed is documented see the home tools
Definition: README.txt:16
simpler
entry mr r2 blr This could be reduced to the much simpler
Definition: README.txt:210
tmp623
< i32 > tmp623
Definition: README.txt:565
_test1
Instead of the following for memset char edx edx edx It might be better to generate eax movl edx movl edx movw edx when we can spare a register It reduces code size Evaluate what the best way to codegen sdiv C is For we currently get ret i32 Y _test1
Definition: README.txt:143
information
The object format emitted by the WebAssembly backed is documented see the home and packaging for producing WebAssembly applications that can run in browsers and other environments wasi sdk provides a more minimal C C SDK based on llvm and a libc based on for producing WebAssemmbly applictions that use the WASI ABI Rust provides WebAssembly support integrated into Cargo There are two main which provides a relatively minimal environment that has an emphasis on being native wasm32 unknown which uses Emscripten internally and provides standard C C filesystem GL and SDL bindings For more information
Definition: README.txt:29
tmp5
< float > tmp5
Definition: README-SSE.txt:794
nth_el
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM ID Predecessors according to mbb< bb27, 0x8b0a7c0 > Note ADDri is not a two address instruction its result reg1037 is an operand of the PHI node in bb76 and its operand reg1039 is the result of the PHI node We should treat it as a two address code and make sure the ADDri is scheduled after any node that reads reg1039 Use info(i.e. register scavenger) to assign it a free register to allow reuse the collector could move the objects and invalidate the derived pointer This is bad enough in the first but safe points can crop up unpredictably **array_addr nth_el
Definition: README.txt:122
movzbl
_test eax xmm0 eax xmm1 comiss xmm1 setae al movzbl ecx eax edx ecx cmove eax ret Note the movzbl
Definition: README.txt:210
pool
Implement PPCInstrInfo::isLoadFromStackSlot isStoreToStackSlot for vector to generate better spill code The first should be a single lvx from the constant pool
Definition: README_ALTIVEC.txt:8
movsd
For the entry BB esp pxor xmm0 movsd(%esp)
test::unit
unit
Definition: README.txt:46
however
however
Definition: README.txt:253
stores
hexagon widen stores
Definition: HexagonStoreWidening.cpp:118
Library
Itanium Name Demangler Library
Definition: README.txt:2
integers
into llvm powi allowing the code generator to produce balanced multiplication trees the intrinsic needs to be extended to support integers
Definition: README.txt:54
to
Should compile to
Definition: README.txt:449
Example
it should cost the same as a move shift on any modern but it s a lot shorter Downside is that it puts more pressure on register allocation because it has fixed operands Example
Definition: README.txt:928
xorl
http eax xorl edx cl sete al setne dl sall eax sall edx But that requires good bit subreg support this might be better It s an extra but it s one instruction and doesn t stress bit subreg eax eax movl edx xorl
Definition: README.txt:37
Shift
bool Shift
Definition: README.txt:468
cr7
Should compile to something r4 addze r3 instead we r3 cmplw cr7
Definition: README.txt:25
hypot
The legalization code for mul with overflow needs to be made more robust before this can be implemented though Get the C front end to expand hypot(x, y) -> llvm.sqrt(x *x+y *y) when errno and precision don 't matter(ffastmath). Misc/mandel will like this. :) This isn 't safe in general, even on darwin. See the libm implementation of hypot for examples(which special case when x/y are exactly zero to get signed zeros etc right). On targets with expensive 64-bit multiply, we could LSR this:for(i=...
calls
into llvm powi calls
Definition: README.txt:51
llvm::reverse
auto reverse(ContainerTy &&C, std::enable_if_t< has_rbegin< ContainerTy >::value > *=nullptr)
Definition: STLExtras.h:329
ip
mov ip
Definition: README.txt:214
Analysis
block Block Frequency Analysis
Definition: BlockFrequencyInfo.cpp:302
ha16
We ha16(.CPI_X_0) lfd f0
llvm::sys::path::end
const_iterator end(StringRef path)
Get end iterator over path.
Definition: Path.cpp:233
llvm::remarks::Type::Missed
@ Missed
i8
Clang compiles this i8
Definition: README.txt:504
cpy
void cpy(struct foo *a, struct foo *b)
Definition: README.txt:122
PIC
PassInstrumentationCallbacks PIC
Definition: PassBuilderBindings.cpp:55
llvm::MipsISD::SDL
@ SDL
Definition: MipsISelLowering.h:251
shrl
http eax xorl edx cl sete al setne dl sall eax sall edx But that requires good bit subreg support this might be better It s an extra but it s one instruction and doesn t stress bit subreg eax shrl
Definition: README.txt:35
llvm::AArch64ISD::CALL
@ CALL
Definition: AArch64ISelLowering.h:52
know
the custom lowered code happens to be but we shouldn t have to custom lower anything This is probably related to< 2 x i64 > ops being so bad LLVM currently generates stack realignment when it is not necessary needed The problem is that we need to know about stack alignment too before RA runs At that point we don t know
Definition: README-SSE.txt:489
StringView::StringView
StringView(const char(&Str)[N])
Definition: StringView.h:31
slot
Since we know that Vector is byte aligned and we know the element offset of we should change the load into a lve *x instead of doing a load store lve *x sequence Implement passing vectors by value into calls and receiving them as arguments GCC apparently tries to then a load and vperm of Variable We need a way to teach tblgen that some operands of an intrinsic are required to be constants The verifier should enforce this constraint We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a byte aligned stack slot
Definition: README_ALTIVEC.txt:57
llvm::WinEH::EncodingType::Itanium
@ Itanium
Windows CE ARM, PowerPC, SH3, SH4.
copies
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM ID Predecessors according to mbb< bb27, 0x8b0a7c0 > Note ADDri is not a two address instruction its result reg1037 is an operand of the PHI node in bb76 and its operand reg1039 is the result of the PHI node We should treat it as a two address code and make sure the ADDri is scheduled after any node that reads reg1039 Use info(i.e. register scavenger) to assign it a free register to allow reuse the collector could move the objects and invalidate the derived pointer This is bad enough in the first but safe points can crop up unpredictably **array_addr i32 n y store obj obj **nth_el If the i64 division is lowered to a then a safe point array and nth_el no longer point into the correct object The fix for this is to copy address calculations so that dependent pointers are never live across safe point boundaries But the loads cannot be copied like this if there was an intervening so may be hard to get right Only a concurrent mutator can trigger a collection at the libcall safe point So single threaded programs do not have this even with a copying collector LLVM optimizations would probably undo a front end s careful work The ocaml frametable structure supports liveness information It would be good to support it The FIXME in ComputeCommonTailLength in BranchFolding cpp needs to be revisited The check is there to work around a misuse of directives in assembly It would be good to detect collector target compatibility instead of silently doing the wrong thing It would be really nice to be able to write patterns in td files for copies
Definition: README.txt:158
llvm::AMDGPU::Exp::Target
Target
Definition: SIDefines.h:739
ppc32
This would be a win on ppc32
Definition: README.txt:37
T
#define T
Definition: Mips16ISelLowering.cpp:341
libcxxabi
Itanium Name Demangler i e convert the string _Z1fv into but neither can depend on each other libcxxabi needs the demangler to implement which is part of the itanium ABI spec LLVM needs a copy for a bunch of but doesn t want to use the system s __cxa_demangle because it a might not be and b probably isn t that up to date on the latest language features The copy of the demangler in LLVM has some extra stuff that aren t needed in which depend on the shared generic components Despite these we want to keep the core generic demangling library identical between both copies to simplify development and testing If you re working on the generic then do the work first in libcxxabi
Definition: README.txt:30
i1
Decimal Convert From to National Zoned Signed int_ppc_altivec_bcdcfno i1
Definition: README_P9.txt:147
ret
to esp esp setne al movzbw ax esp setg cl movzbw cx cmove cx cl jne LBB1_2 esp ret(also really horrible code on ppc). This is due to the expand code for 64-bit compares. GCC produces multiple branches
comparison
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each comparison
Definition: README.txt:401
a3
This is blocked on not handling X *X *X which is the same number of multiplies and is because the *X has multiple uses Here s a simple X1 B ret i32 C Reassociate should handle the example in GCC a3
Definition: README.txt:84
at
compiles ldr LCPI1_0 ldr ldr mov lsr tst moveq r1 ldr LCPI1_1 and r0 bx lr It would be better to do something like to fold the shift into the conditional ldr LCPI1_0 ldr ldr tst movne lsr ldr LCPI1_1 and r0 bx lr it saves an instruction and a register It might be profitable to cse MOVi16 if there are lots of bit immediates with the same bottom half Robert Muth started working on an alternate jump table implementation that does not put the tables in line in the text This is more like the llvm default jump table implementation This might be useful sometime Several revisions of patches are on the mailing beginning at
Definition: README.txt:582
llvm::sys::fs::access
std::error_code access(const Twine &Path, AccessMode Mode)
Can the file be accessed?
fh
float fh
Definition: README.txt:292
llvm::errs
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
Definition: raw_ostream.cpp:892
idiom
hexagon loop idiom
Definition: HexagonLoopIdiomRecognition.cpp:285
g
should just be implemented with a CLZ instruction Since there are other e g
Definition: README.txt:709
tmp1
urem i32 %X, 255 ret i32 %tmp1 } Currently it compiles to:... movl $2155905153, %ecx movl 8(%esp), %esi movl %esi, %eax mull %ecx ... This could be "reassociated" into:movl $2155905153, %eax movl 8(%esp), %ecx mull %ecx to avoid the copy. In fact, the existing two-address stuff would do this except that mul isn 't a commutative 2-addr instruction. I guess this has to be done at isel time based on the #uses to mul? Make sure the instruction which starts a loop does not cross a cacheline boundary. This requires knowning the exact length of each machine instruction. That is somewhat complicated, but doable. Example 256.bzip2:In the new trace, the hot loop has an instruction which crosses a cacheline boundary. In addition to potential cache misses, this can 't help decoding as I imagine there has to be some kind of complicated decoder reset and realignment to grab the bytes from the next cacheline. 532 532 0x3cfc movb(1809(%esp, %esi), %bl<<<--- spans 2 64 byte lines 942 942 0x3d03 movl %dh,(1809(%esp, %esi) 937 937 0x3d0a incl %esi 3 3 0x3d0b cmpb %bl, %dl 27 27 0x3d0d jnz 0x000062db< main+11707 > In c99 mode, the preprocessor doesn 't like assembly comments like #TRUNCATE. This could be a single 16-bit load. int f(char *p) { if((p[0]==1) &(p[1]==2)) return 1 tmp1
Definition: README.txt:375
pass
we should consider alternate ways to model stack dependencies Lots of things could be done in WebAssemblyTargetTransformInfo cpp there are numerous optimization related hooks that can be overridden in WebAssemblyTargetLowering Instead of the OptimizeReturned pass
Definition: README.txt:73
that
we should consider alternate ways to model stack dependencies Lots of things could be done in WebAssemblyTargetTransformInfo cpp there are numerous optimization related hooks that can be overridden in WebAssemblyTargetLowering Instead of the OptimizeReturned which should consider preserving the returned attribute through to MachineInstrs and extending the MemIntrinsicResults pass to do this optimization on calls too That would also let the WebAssemblyPeephole pass clean up dead defs for such as it does for stores Consider implementing and or getMachineCombinerPatterns Find a clean way to fix the problem which leads to the Shrink Wrapping pass being run after the WebAssembly PEI pass When setting multiple variables to the same we currently get code like const It could be done with a smaller encoding like local tee $pop5 local $pop6 WebAssembly registers are implicitly initialized to zero Explicit zeroing is therefore often redundant and could be optimized away Small indices may use smaller encodings than large indices WebAssemblyRegColoring and or WebAssemblyRegRenumbering should sort registers according to their usage frequency to maximize the usage of smaller encodings Many cases of irreducible control flow could be transformed more optimally than via the transform in WebAssemblyFixIrreducibleControlFlow cpp It may also be worthwhile to do transforms before register particularly when duplicating to allow register coloring to be aware of the duplication WebAssemblyRegStackify could use AliasAnalysis to reorder loads and stores more aggressively WebAssemblyRegStackify is currently a greedy algorithm This means that
Definition: README.txt:130
checking
bounds checking
Definition: BoundsChecking.cpp:249
assertion
we compile this esp call L1 $pb L1 esp je LBB1_2 esp ret but is currently always computed in the entry block It would be better to sink the picbase computation down into the block for the assertion
Definition: README.txt:422
tmp
alloca< 16 x float >, align 16 %tmp2=alloca< 16 x float >, align 16 store< 16 x float > %A,< 16 x float > *%tmp %s=bitcast< 16 x float > *%tmp to i8 *%s2=bitcast< 16 x float > *%tmp2 to i8 *call void @llvm.memcpy.i64(i8 *%s, i8 *%s2, i64 64, i32 16) %R=load< 16 x float > *%tmp2 ret< 16 x float > %R } declare void @llvm.memcpy.i64(i8 *nocapture, i8 *nocapture, i64, i32) nounwind which compiles to:_foo:subl $140, %esp movaps %xmm3, 112(%esp) movaps %xmm2, 96(%esp) movaps %xmm1, 80(%esp) movaps %xmm0, 64(%esp) movl 60(%esp), %eax movl %eax, 124(%esp) movl 56(%esp), %eax movl %eax, 120(%esp) movl 52(%esp), %eax< many many more 32-bit copies > movaps(%esp), %xmm0 movaps 16(%esp), %xmm1 movaps 32(%esp), %xmm2 movaps 48(%esp), %xmm3 addl $140, %esp ret On Nehalem, it may even be cheaper to just use movups when unaligned than to fall back to lower-granularity chunks. Implement processor-specific optimizations for parity with GCC on these processors. GCC does two optimizations:1. ix86_pad_returns inserts a noop before ret instructions if immediately preceded by a conditional branch or is the target of a jump. 2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of code contains more than 3 branches. The first one is done for all AMDs, Core2, and "Generic" The second one is done for:Atom, Pentium Pro, all AMDs, Pentium 4, Nocona, Core 2, and "Generic" Testcase:int x(int a) { return(a &0xf0)> >4 tmp
Definition: README.txt:1347
llvm::lltok::less
@ less
Definition: LLToken.h:32
Likewise
Should compile r2 movcc movcs str strb mov lr r1 movcs movcc mov lr r1 str mov mov cmp r1 movlo r2 str bx lr r0 mov mov cmp r0 movhs r2 mov r1 bx lr Some of the NEON intrinsics may be appropriate for more general either as target independent intrinsics or perhaps elsewhere in the ARM backend Some of them may also be lowered to target independent and perhaps some new SDNodes could be added For and absolute value operations are well defined and standard both for vector and scalar types The current NEON specific intrinsics for count leading zeros and count one bits could perhaps be replaced by the target independent ctlz and ctpop intrinsics It may also make sense to add a target independent ctls intrinsic for count leading sign bits Likewise
Definition: README.txt:495
intrinsic
QP Compare Ordered outs ins xscmpudp No intrinsic
Definition: README_P9.txt:303
handle
then ret i32 result Tail recursion elimination should handle
Definition: README.txt:355
generates
gets compiled into this on rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movq rsp movq rsp movq rsp movq rsp movq rsp rax movq rsp rax movq rsp rsp rsp eax eax jbe LBB1_3 rcx rax movq rsp eax rsp ret ecx eax rcx movl rsp jmp LBB1_2 gcc generates
Definition: README.txt:1153
replace
static void replace(Module &M, GlobalVariable *Old, GlobalVariable *New)
Definition: ConstantMerge.cpp:116
and
We currently generate a but we really shouldn eax ecx xorl edx divl ecx eax divl ecx movl eax ret A similar code sequence works for division We currently compile i32 v2 eax eax jo LBB1_2 and
Definition: README.txt:1271
x86
Note that only the low bits of effective_addr2 are used On bit we don t eliminate the computation of the top half of effective_addr2 because we don t have whole function selection dags On x86
Definition: README.txt:318
SpecialSubKind::allocator
@ allocator
full_add
Should compile to use and accumulates this with a bit value We currently get this with both v4 and r0 smull r2 adds r1 adc r0 bx lr bool full_add(unsigned a, unsigned b)
Definition: README.txt:441
preds
preds
Definition: README.txt:340
p
the resulting code requires compare and branches when and if * p
Definition: README.txt:396
movzwl
The following code is currently eax eax ecx jb LBB1_2 eax movzbl eax ret eax ret We could change the eax into movzwl(%esp)
llvm::LegalityPredicates::any
Predicate any(Predicate P0, Predicate P1)
True iff P0 or P1 are true.
Definition: LegalizerInfo.h:241
a4
This is blocked on not handling X *X *X which is the same number of multiplies and is because the *X has multiple uses Here s a simple X1 B ret i32 C Reassociate should handle the example in GCC a4
Definition: README.txt:84
sinking
Machine code sinking
Definition: MachineSink.cpp:265
size_t
http
Clang compiles this i1 i64 store i64 i64 store i64 i64 store i64 i64 store i64 align Which gets codegen d xmm0 movaps rbp movaps rbp movaps rbp movaps rbp rbp rbp rbp rbp It would be better to have movq s of instead of the movaps s http
Definition: README.txt:532
like
Should compile to something like
Definition: README.txt:19
Demangler
itanium_demangle::ManglingParser< DefaultAllocator > Demangler
Definition: ItaniumDemangle.cpp:324
attrs
function attrs
Definition: FunctionAttrs.cpp:1676
least
Instead of the following for memset char edx edx edx It might be better to generate eax movl edx movl edx movw edx when we can spare a register It reduces code size Evaluate what the best way to codegen sdiv C is For we currently get ret i32 Y eax movl ecx ecx ecx addl eax eax ret GCC knows several different ways to codegen one of which is eax eax ecx cmovle eax eax ret which is probably but it s interesting at least
Definition: README.txt:166
F
#define F(x, y, z)
Definition: MD5.cpp:56
loop
Analysis the ScalarEvolution expression for r is< loop > Outside the loop
Definition: README.txt:8
wrapped
is currently compiled esp esp jne LBB1_1 esp ret esp esp jne L_abort $stub esp ret This can be applied to any no return function call that takes no arguments etc the stack save restore logic could be shrink wrapped
Definition: README.txt:412
extended
we should consider alternate ways to model stack dependencies Lots of things could be done in WebAssemblyTargetTransformInfo cpp there are numerous optimization related hooks that can be overridden in WebAssemblyTargetLowering Instead of the OptimizeReturned which should consider preserving the returned attribute through to MachineInstrs and extending the MemIntrinsicResults pass to do this optimization on calls too That would also let the WebAssemblyPeephole pass clean up dead defs for such as it does for stores Consider implementing and or getMachineCombinerPatterns Find a clean way to fix the problem which leads to the Shrink Wrapping pass being run after the WebAssembly PEI pass When setting multiple variables to the same we currently get code like const It could be done with a smaller encoding like local tee $pop5 local $pop6 WebAssembly registers are implicitly initialized to zero Explicit zeroing is therefore often redundant and could be optimized away Small indices may use smaller encodings than large indices WebAssemblyRegColoring and or WebAssemblyRegRenumbering should sort registers according to their usage frequency to maximize the usage of smaller encodings Many cases of irreducible control flow could be transformed more optimally than via the transform in WebAssemblyFixIrreducibleControlFlow cpp It may also be worthwhile to do transforms before register particularly when duplicating to allow register coloring to be aware of the duplication WebAssemblyRegStackify could use AliasAnalysis to reorder loads and stores more aggressively WebAssemblyRegStackify is currently a greedy algorithm This means for a binary however wasm doesn t actually require this WebAssemblyRegStackify could be extended
Definition: README.txt:149
llvm::tgtok::plus
@ plus
Definition: TGLexer.h:37
llvm::lltok::equal
@ equal
Definition: LLToken.h:25
instructions
print must be executed print the must be executed context for all instructions
Definition: MustExecute.cpp:355
remainder
div rem Hoist decompose integer division and remainder
Definition: DivRemPairs.cpp:427
a
=0.0 ? 0.0 :(a > 0.0 ? 1.0 :-1.0) a
Definition: README.txt:489
r1
__Z6slow4bii r1 movgt r1
Definition: README.txt:62
floor
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g floor
Definition: README-FPStack.txt:54
always
bar al al movzbl eax ret Missed when stored in a memory are stored as single byte objects the value of which is always(false) or 1(true). We are not using this fact
Definition: README.txt:1412
different
Code Generation Notes for reduce the size of the ISel and reduce repetition in the implementation In a small number of this can cause different(semantically equivalent) instructions to be used in place of the requested instruction
way
should just be implemented with a CLZ instruction Since there are other e that share this it would be best to implement this in a target independent way
Definition: README.txt:720
clang
The object format emitted by the WebAssembly backed is documented see the home and packaging for producing WebAssembly applications that can run in browsers and other environments wasi sdk provides a more minimal C C SDK based on clang
Definition: README.txt:19
clear
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:233
sbbl
Add support for conditional and other related patterns Instead eax eax je LBB16_2 eax edi sbbl
Definition: README.txt:144
checks
Implicit null checks
Definition: ImplicitNullChecks.cpp:822
rax
gets compiled into this on rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movq rsp movq rsp movq rsp movq rsp movq rsp rax movq rax
Definition: README.txt:1129
here
A predicate compare being used in a select_cc should have the same peephole applied to it as a predicate compare used by a br_cc There should be no mfcr here
Definition: README_ALTIVEC.txt:147
processors
http eax xorl edx cl sete al setne dl sall eax sall edx But that requires good bit subreg support this might be better It s an extra but it s one instruction and doesn t stress bit subreg eax eax movl edx edx sall eax sall cl edx bit we should expand to a conditional branch like GCC produces Some isel and Sequencing of Instructions Scheduling for reduced register pressure E g Minimum Register Instruction Sequence load p Because the compare isn t it is not matched with the load on both sides The dag combiner should be made smart enough to canonicalize the load into the RHS of a compare when it can invert the result of the compare for free In many LLVM generates code like eax cmpl esp setl al movzbl eax ret on some processors(which ones?)
llvm::sys::fs::equivalent
bool equivalent(file_status A, file_status B)
Do file_status's represent the same thing?
reg1039
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 reg1039
Definition: README.txt:36
StringView::substr
StringView substr(size_t Pos, size_t Len=npos) const
Definition: StringView.h:39
LBB1_2
entry stw r5 blr GCC r3 srawi xor r4 subf r0 stw r5 blr which is much nicer This theoretically may help improve twolf li blt LBB1_2
Definition: README.txt:201
idioms
hexagon loop Recognize Hexagon specific loop idioms
Definition: HexagonLoopIdiomRecognition.cpp:286
r4
Common register allocation spilling r4
Definition: README.txt:5
R2
#define R2(n)
bb
< i1 > br i1 label label bb bb
Definition: README.txt:978
simplification
hexagon bit Hexagon bit simplification
Definition: HexagonBitSimplify.cpp:262
i64
Clang compiles this i64
Definition: README.txt:504
CSE
separate const offset from Split GEPs to a variadic base and a constant offset for better CSE
Definition: SeparateConstOffsetFromGEP.cpp:496
lo16
We lo16(.CPI_X_0)(r2) lis r2
indvars
indvars
Definition: IndVarSimplify.cpp:1974
turning
We have fiadd patterns now but the followings have the same cost and complexity We need a way to specify the later is more profitable def def The FP stackifier should handle simple permutates to reduce number of shuffle e g turning
Definition: README-FPStack.txt:54
no_overflow
bool no_overflow(unsigned a, unsigned b)
Definition: README.txt:443
select
into xmm2 addss xmm2 xmm1 xmm3 addss xmm3 movaps xmm0 unpcklps xmm0 ret seems silly when it could just be one addps Expand libm rounding functions main should enable SSE DAZ mode and other fast SSE modes Think about doing i64 math in SSE regs on x86 This testcase should have no SSE instructions in and only one load from a constant double ret double C the select is being which prevents the dag combiner from turning select(load CPI1)
So
So we should use XX3Form_Rcr to implement instrinsic Convert DP outs ins xscvdpsp So
Definition: README_P9.txt:331
movw
Instead of the following for memset char edx edx movw
Definition: README.txt:121
LCPI1_0
cond_true lis lo16() LCPI1_0(r2) lis r2
l0
To do *Keep the address of the constant pool in a register instead of forming its address all of the time *We can fold small constant offsets into the hi lo references to constant pool addresses as well *When in V9 register allocate icc *[0-3] Add support for isel ing UMUL_LOHI instead of marking it as Expand *Emit the Branch on Integer Register with Prediction instructions It s not clear how to write a pattern for this int o6 subcc l0 bne LBBt1_2 ! F nop l0 st l0
Definition: README.txt:31
L2
add sub stmia L5 ldr L2
Definition: README.txt:201
llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:679
bits
demanded bits
Definition: DemandedBits.cpp:63
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
available
This requires reassociating to forms of expressions that are already available
Definition: README.txt:92
rsi
gets compiled into this on rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movq rsp movq rsp movq rsp movq rsp movq rsi
Definition: README.txt:1127
s2
_bar mov r0 mov r1 fldd LCPI1_0 fmrrd d0 bl _foo fmdrr r5 fmsr s2
Definition: README.txt:159
shl
We currently generate a but we really shouldn eax ecx xorl edx divl ecx eax divl ecx movl eax ret A similar code sequence works for division We currently compile i32 v2 eax eax jo LBB1_2 shl
Definition: README.txt:1271
them
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM ID Predecessors according to mbb< bb27, 0x8b0a7c0 > Note ADDri is not a two address instruction its result reg1037 is an operand of the PHI node in bb76 and its operand reg1039 is the result of the PHI node We should treat it as a two address code and make sure the ADDri is scheduled after any node that reads reg1039 Use info(i.e. register scavenger) to assign it a free register to allow reuse the collector could move the objects and invalidate the derived pointer This is bad enough in the first but safe points can crop up unpredictably **array_addr i32 n y store obj obj **nth_el If the i64 division is lowered to a then a safe point array and nth_el no longer point into the correct object The fix for this is to copy address calculations so that dependent pointers are never live across safe point boundaries But the loads cannot be copied like this if there was an intervening so may be hard to get right Only a concurrent mutator can trigger a collection at the libcall safe point So single threaded programs do not have this even with a copying collector LLVM optimizations would probably undo a front end s careful work The ocaml frametable structure supports liveness information It would be good to support it The FIXME in ComputeCommonTailLength in BranchFolding cpp needs to be revisited The check is there to work around a misuse of directives in assembly It would be good to detect collector target compatibility instead of silently doing the wrong thing It would be really nice to be able to write patterns in td files for which would eliminate a bunch of predicates on them(e.g. no side effects). Once this is in place
f
Itanium Name Demangler i e convert the string _Z1fv into f()". You can also use the CRTP base ManglingParser to perform some simple analysis on the mangled name
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
mulss
esp eax movl ecx ecx cvtsi2ss xmm0 eax cvtsi2ss xmm1 mulss
Definition: README.txt:304
edx
sar eax, 31) more aggressively edx
Definition: README.txt:923
StringView::consumeFront
bool consumeFront(char C)
Definition: StringView.h:82
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
cmp
< i32 >< i32 > cmp
Definition: README.txt:1447
paths
Reduce control height in the hot paths
Definition: ControlHeightReduction.cpp:138
LBB1_1
we compile this esp call L1 $pb L1 esp je LBB1_2 LBB1_1
Definition: README.txt:414
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
int
StringView::npos
static const size_t npos
Definition: StringView.h:28
StringView::popFront
char popFront()
Definition: StringView.h:77
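A minimal sketch of how popFront() can drive character-by-character scanning. The helper name is hypothetical, and the snippet assumes it is compiled where StringView is visible (the namespace is selected by DEMANGLE_NAMESPACE_BEGIN/END):

  // Count leading decimal digits by consuming one character at a time.
  size_t countLeadingDigits(StringView SV) {
    size_t Count = 0;
    while (!SV.empty() && SV.front() >= '0' && SV.front() <= '9') {
      SV.popFront(); // removes and returns the first character
      ++Count;
    }
    return Count;
  }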
StringView::end
const char * end() const
Definition: StringView.h:107
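Because begin() and end() expose the underlying character range, a view can be iterated like any other range. A small sketch, with a hypothetical helper name:

  // Count occurrences of C using range-based iteration over the view.
  size_t countChar(StringView SV, char C) {
    size_t N = 0;
    for (char Ch : SV) // uses StringView::begin()/end()
      if (Ch == C)
        ++N;
    return N;
  }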
llvm::demangle
std::string demangle(const std::string &MangledName)
Attempt to demangle a string using different demangling schemes.
Definition: Demangle.cpp:27
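llvm::demangle (declared in llvm/Demangle/Demangle.h) tries each supported mangling scheme in turn and, if none applies, returns the input unchanged. A short usage sketch; the mangled name is only an illustrative input:

  #include "llvm/Demangle/Demangle.h"
  #include <iostream>

  int main() {
    // "_Z1fv" is a simple Itanium-mangled name used here as an example.
    std::cout << llvm::demangle("_Z1fv") << "\n"; // prints "f()"
    return 0;
  }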
StringView::startsWith
bool startsWith(StringView Str) const
Definition: StringView.h:98
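A hedged sketch of prefix matching with the StringView overload; the helper below is hypothetical:

  // True when Name looks like an Itanium-mangled symbol.
  bool looksMangled(StringView Name) {
    // The string literal converts to a two-character view (the
    // trailing NUL is excluded by the array constructor).
    return Name.startsWith("_Z");
  }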
DEMANGLE_NAMESPACE_END
#define DEMANGLE_NAMESPACE_END
Definition: DemangleConfig.h:90
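DEMANGLE_NAMESPACE_BEGIN and DEMANGLE_NAMESPACE_END from DemangleConfig.h bracket the declarations in this header so the same source can be compiled into different namespaces (for LLVM this is typically llvm::itanium_demangle). A sketch of the usual pattern; the struct name is illustrative only:

  #include "DemangleConfig.h"

  DEMANGLE_NAMESPACE_BEGIN

  // Declarations placed here land in whatever namespace the
  // configuration header selects.
  struct ExampleNode {};

  DEMANGLE_NAMESPACE_END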
StringView::StringView
StringView(const char *Str)
Definition: StringView.h:36
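This constructor measures the input with std::strlen, so it is only suitable for NUL-terminated strings. A short sketch contrasting it with the explicit-length and pointer-pair forms (variable names are illustrative):

  void ctorExamples() {
    const char *Buf = "hello world";
    StringView Whole(Buf);             // length comes from std::strlen
    StringView FirstWord(Buf, 5);      // explicit length: "hello"
    StringView SameWord(Buf, Buf + 5); // equivalent [First, Last) form
    (void)Whole; (void)FirstWord; (void)SameWord;
  }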
StringView::find
size_t find(char C, size_t From=0) const
Definition: StringView.h:44
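find returns the index of the first occurrence of C at or after From, or StringView::npos when the character is absent. A minimal sketch that splits a view at a separator; the helper name is hypothetical:

  // Return the prefix before Sep and leave SV holding the remainder
  // (SV becomes empty when Sep is not found).
  StringView takeUntil(StringView &SV, char Sep) {
    size_t Pos = SV.find(Sep);
    if (Pos == StringView::npos) {
      StringView All = SV;
      SV = StringView();
      return All;
    }
    StringView Prefix = SV.substr(0, Pos);
    SV = SV.dropFront(Pos + 1); // also skip the separator
    return Prefix;
  }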
StringView::startsWith
bool startsWith(char C) const
Definition: StringView.h:96
StringView::front
char front() const
Definition: StringView.h:67
DemangleConfig.h
operator==
bool operator==(const StringView &LHS, const StringView &RHS)
Definition: StringView.h:112
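Equality compares length and contents rather than pointer identity, so two views over different buffers holding the same bytes compare equal. A small sketch:

  bool sameBytes() {
    char A[] = {'a', 'b', 'c'};
    StringView X(A, 3);  // view over a non-NUL-terminated buffer
    StringView Y("abc"); // literal constructor excludes the trailing NUL
    return X == Y;       // true: same length, same contents
  }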
std
Definition: BitVector.h:838
StringView::back
char back() const
Definition: StringView.h:72
DEMANGLE_NAMESPACE_BEGIN
#define DEMANGLE_NAMESPACE_BEGIN
Definition: DemangleConfig.h:89
armv5
compiles ldr LCPI1_0 ldr ldr mov lsr tst moveq r1 ldr LCPI1_1 and r0 bx lr It would be better to do something like to fold the shift into the conditional ldr LCPI1_0 ldr ldr tst movne lsr ldr LCPI1_1 and r0 bx lr it saves an instruction and a register It might be profitable to cse MOVi16 if there are lots of bit immediates with the same bottom half Robert Muth started working on an alternate jump table implementation that does not put the tables in line in the text This is more like the llvm default jump table implementation This might be useful sometime Several revisions of patches are on the mailing beginning while CMP sets them like a subtract Therefore to be able to use CMN for comparisons other than the Z we ll need additional logic to reverse the conditionals associated with the comparison Perhaps a pseudo instruction for the with a post codegen pass to clean up and handle the condition codes See PR5694 for testcase Given the following on armv5
Definition: README.txt:592
v1
llvm lib Support Unix the directory structure underneath this directory could look like only those directories actually needing to be created should be created further subdirectories could be created to reflect versions of the various standards For under SUS there could be v1
Definition: README.txt:15
llvm::support::endian::read
value_type read(const void *memory, endianness endian)
Read a value of a particular endianness from memory.
Definition: Endian.h:63
addi
Function< 16 x i8 > Produces the following code with LCPI0_0 toc ha LCPI0_1 toc ha addi
Definition: README_ALTIVEC.txt:233
examples
vperm to rotate result into correct slot vsel result together Should codegen branches on vec_any vec_all to avoid mfcr Two examples
Definition: README_ALTIVEC.txt:190
reuse
loop Interchanges loops for cache reuse
Definition: LoopInterchange.cpp:1808
tmp245246
< i32 > tmp245246
Definition: README.txt:557
td
gets compiled into this on rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movaps rsp movq rsp movq rsp movq rsp movq rsp movq rsp rax movq rsp rax movq rsp rsp rsp eax eax jbe LBB1_3 rcx rax movq rsp eax rsp ret ecx eax rcx movl rsp jmp LBB1_2 gcc rsp rax movq rsp rsp movq rsp rax movq rsp eax eax jb L6 rdx eax rsp ret p2align edx rdx eax movl rsp eax rsp ret and it gets compiled into this on ebp esp eax movl ebp eax movl ebp eax esp popl ebp ret gcc ebp eax popl ebp ret Teach tblgen not to check bitconvert source type in some cases This allows us to consolidate the following patterns in X86InstrMMX td
Definition: README.txt:1204
give
therefore end up llgh r3 lr r0 br r14 but truncating the load would give
Definition: README.txt:91
options
The object format emitted by the WebAssembly backed is documented see the home and packaging for producing WebAssembly applications that can run in browsers and other environments wasi sdk provides a more minimal C C SDK based on llvm and a libc based on for producing WebAssemmbly applictions that use the WASI ABI Rust provides WebAssembly support integrated into Cargo There are two main options
Definition: README.txt:24
LowerSELECT_CC
static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG, const SparcTargetLowering &TLI, bool hasHardQuad)
Definition: SparcISelLowering.cpp:2473
llvm::MCID::Branch
@ Branch
Definition: MCInstrDesc.h:156
_test
float space text globl _test align _test
Definition: README_ALTIVEC.txt:118
improvement
is pessimized by loop reduce and indvars u32 to float conversion improvement
Definition: README.txt:291
case
we compile this esp call L1 $pb L1 esp je LBB1_2 esp ret but is currently always computed in the entry block It would be better to sink the picbase computation down into the block for the as it is the only one that uses it This happens for a lot of code with early outs Another example is loads of which are usually emitted into the entry block on targets like x86 If not used in all paths through a they should be sunk into the ones that do In this case
Definition: README.txt:429
argpromotion
argpromotion
Definition: ArgumentPromotion.cpp:1099
testl
this lets us change the cmpl into a testl
Definition: README.txt:967
llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1488
llvm::TargetStackID::Value
Value
Definition: TargetFrameLowering.h:27
StringView::dropBack
StringView dropBack(size_t N=1) const
Definition: StringView.h:61
llvm::log2
static double log2(double V)
Definition: AMDGPULibCalls.cpp:842
llvm::SyncScope::System
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:58
require
static void require(bool Assertion, const char *Msg, llvm::StringRef Code)
Definition: Annotations.cpp:19
llvm::MCID::Add
@ Add
Definition: MCInstrDesc.h:183
llvm::ISD::STORE
@ STORE
Definition: ISDOpcodes.h:922
StringView::begin
const char * begin() const
Definition: StringView.h:106
never
#define never
Definition: regcomp.c:286
llvm::CannotBeNegativeZero
bool CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI, unsigned Depth=0)
Return true if we can prove that the specified FP value is never equal to -0.0.
Definition: ValueTracking.cpp:3440
llvm::numbers::phi
constexpr double phi
Definition: MathExtras.h:71
StringView::dropFront
StringView dropFront(size_t N=1) const
Definition: StringView.h:55
llvm::TPLoop::Allow
@ Allow
Definition: ARMTargetTransformInfo.h:53
N
#define N
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:340
StringView::empty
bool empty() const
Definition: StringView.h:109
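The dropFront, dropBack and empty entries above are the members a demangler normally uses to trim characters from either end of a view without copying. A minimal sketch of that pattern follows; it assumes StringView.h is included and also uses the front() and back() accessors declared in the header. trimSpaces is a hypothetical helper written for illustration, not part of the header.
// Sketch only: assumes StringView.h is included. trimSpaces is a
// hypothetical helper, not something the header provides.
static StringView trimSpaces(StringView S) {
  while (!S.empty() && S.front() == ' ')
    S = S.dropFront(); // drop a single leading space
  while (!S.empty() && S.back() == ' ')
    S = S.dropBack();  // drop a single trailing space
  return S;
}
Because dropFront and dropBack only adjust the stored begin/end pointers, trimSpaces("  foo  ") yields a view over the inner "foo" without allocating or copying.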
StringView
Definition: StringView.h:23
llvm::msgpack::Type
Type
MessagePack types as defined in the standard, with the exception of Integer being divided into a sign...
Definition: MsgPackReader.h:49
From
BlockVerifier::State From
Definition: BlockVerifier.cpp:55
StringView::consumeFront
bool consumeFront(StringView S)
Definition: StringView.h:89
StringView::StringView
StringView(const char *First_, const char *Last_)
Definition: StringView.h:32
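The two-pointer constructor and consumeFront(StringView) shown above are typically combined to peel a known prefix off a raw buffer in place. A small sketch under the same assumption that StringView.h is included; looksItaniumMangled is a hypothetical helper, and the "_Z" prefix check is only an illustrative use, not something this header defines.
#include <cstring>

// Sketch only: looksItaniumMangled is a hypothetical helper. The string
// literal binds to StringView's array constructor, so the StringView
// overload of consumeFront is selected.
static bool looksItaniumMangled(const char *Buf) {
  StringView View(Buf, Buf + std::strlen(Buf)); // (First, Last) constructor
  return View.consumeFront("_Z");               // drops "_Z" when it matches
}
On success the view now starts just past the prefix, which is why consumeFront is convenient for recursive-descent parsing of mangled names.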
llvm::abs
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition: APFloat.h:1284
llvm::support::aligned
@ aligned
Definition: Endian.h:30
INF
#define INF
llvm::support::big
@ big
Definition: Endian.h:27
write
static void write(bool isBE, void *P, T V)
Definition: RuntimeDyldELF.cpp:37
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37