
LLVM does a poor job of rematerializing address offsets which will fold into the addressing mode #22604

Open
llvmbot opened this issue Jan 14, 2015 · 7 comments
Labels
bugzilla (Issues migrated from bugzilla), llvm:codegen

Comments

@llvmbot
Collaborator

llvmbot commented Jan 14, 2015

Bugzilla Link: 22230
Version: trunk
OS: All
Reporter: LLVM Bugzilla Contributor
CC: @chandlerc, @compnerd, @majnemer, @echristo, @hfinkel, @qcolombet, @rnk

Extended Description

A certain code pattern, where a function mostly consists of a large loop around a switch or if-else chain, results in large live ranges for basic address computations. The register allocator fails to rematerialize these address computations into the addressing modes of memory operands and instead spills and reloads them from the stack, dramatically increasing stack usage and generally causing badness.

Small reproduction:

struct A {
  unsigned a;
  unsigned b;
  unsigned c;
  unsigned d;
};

void assign(unsigned *val);

void f(unsigned char *input, A *a) {
  for (;;) {
    unsigned char t = (++input)[0];
    if (t == 0) {
      assign(&a->a);
    } else if (t == 1) {
      assign(&a->b);
    } else if (t == 2) {
      assign(&a->c);
    } else if (t == 3) {
      assign(&a->d);
    }
  }
}
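The original report does not give the exact compiler invocation; something along the lines of the following (file name made up), targeting x86-64, should reproduce assembly similar to what is shown below.

$ clang++ -O2 -S repro.cpp -o -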

Resulting in:

__Z1fPhP1A:                             ## @_Z1fPhP1A
    .cfi_startproc
## BB#0:                                ## %entry
    pushq   %rbp
Ltmp0:
    .cfi_def_cfa_offset 16
Ltmp1:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
Ltmp2:
    .cfi_def_cfa_register %rbp
    pushq   %r15
    pushq   %r14
    pushq   %r13
    pushq   %r12
    pushq   %rbx
    subq    $24, %rsp
Ltmp3:
    .cfi_offset %rbx, -56
Ltmp4:
    .cfi_offset %r12, -48
Ltmp5:
    .cfi_offset %r13, -40
Ltmp6:
    .cfi_offset %r14, -32
Ltmp7:
    .cfi_offset %r15, -24
    movq    %rdi, %rbx
    leaq    4(%rsi), %rax
    movq    %rax, -48(%rbp)         ## 8-byte Spill
    leaq    8(%rsi), %rax
    movq    %rax, -56(%rbp)         ## 8-byte Spill
    leaq    12(%rsi), %rax
    movq    %rax, -64(%rbp)         ## 8-byte Spill
    leaq    16(%rsi), %r14
    leaq    20(%rsi), %r15
    movq    %rsi, %r13
    incq    %rbx
    leaq    LJTI0_0(%rip), %r12
    jmp     LBB0_1
[.....]

@rnk
Collaborator

rnk commented Jan 14, 2015

We could also solve this in CodeGenPrep by sinking the GEPs into the blocks where they are used. Manually doing that IR transform and invoking llc gives me good code:

$ cat t.ll
; ModuleID = 't.cpp'
target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
target triple = "i686-pc-windows-msvc"

%struct.A = type { i32, i32, i32, i32 }

; Function Attrs: noreturn
define void @"\01?f@@YAXPAEPAUA@@@Z"(i8* nocapture readonly %input, %struct.A* %a) #0 {
entry:
  br label %for.cond

for.cond:                                 ; preds = %for.cond.backedge, %entry
  %input.addr.0 = phi i8* [ %input, %entry ], [ %incdec.ptr, %for.cond.backedge ]
  %incdec.ptr = getelementptr inbounds i8* %input.addr.0, i32 1
  %0 = load i8* %incdec.ptr, align 1, !tbaa !1
  switch i8 %0, label %for.cond.backedge [
    i8 0, label %if.then
    i8 1, label %if.then4
    i8 2, label %if.then8
    i8 3, label %if.then12
  ]

if.then:                                  ; preds = %for.cond
  %a1 = getelementptr inbounds %struct.A* %a, i32 0, i32 0
  tail call void @"\01?assign@@YAXPAI@Z"(i32* %a1)
  br label %for.cond.backedge

if.then4:                                 ; preds = %for.cond
  %b = getelementptr inbounds %struct.A* %a, i32 0, i32 1
  tail call void @"\01?assign@@YAXPAI@Z"(i32* %b)
  br label %for.cond.backedge

for.cond.backedge:                        ; preds = %if.then4, %if.then12, %if.then8, %if.then, %for.cond
  br label %for.cond

if.then8:                                 ; preds = %for.cond
  %c = getelementptr inbounds %struct.A* %a, i32 0, i32 2
  tail call void @"\01?assign@@YAXPAI@Z"(i32* %c)
  br label %for.cond.backedge

if.then12:                                ; preds = %for.cond
  %d = getelementptr inbounds %struct.A* %a, i32 0, i32 3
  tail call void @"\01?assign@@YAXPAI@Z"(i32* %d)
  br label %for.cond.backedge
}

declare void @"\01?assign@@YAXPAI@Z"(i32*) #1

attributes #0 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.ident = !{!0}

!0 = !{!"clang version 3.6.0 "}
!1 = !{!2, !2, i64 0}
!2 = !{!"omnipotent char", !3, i64 0}
!3 = !{!"Simple C/C++ TBAA"}

$ llc t.ll -o -
    .text
    .def     @feat.00;
    .scl    3;
    .type   0;
    .endef
    .globl  @feat.00
@feat.00 = 1
    .def     "?f@@YAXPAEPAUA@@@Z";
    .scl    2;
    .type   32;
    .endef
    .globl  "?f@@YAXPAEPAUA@@@Z"
    .align  16, 0x90
"?f@@YAXPAEPAUA@@@Z":                   # @"\01?f@@YAXPAEPAUA@@@Z"
# BB#0:                                 # %entry
    pushl   %ebp
    movl    %esp, %ebp
    pushl   %edi
    pushl   %esi
    pushl   %eax
    movl    12(%ebp), %esi
    movl    8(%ebp), %edi
    jmp     LBB0_1
    .align  16, 0x90
LBB0_5:                                 # %for.cond.backedge
                                        #   in Loop: Header=BB0_1 Depth=1
    movl    %eax, (%esp)
    calll   "?assign@@YAXPAI@Z"
LBB0_1:                                 # %for.cond
                                        # =>This Inner Loop Header: Depth=1
    incl    %edi
    movzbl  (%edi), %eax
    cmpl    $3, %eax
    ja      LBB0_1
# BB#2:                                 # %for.cond
                                        #   in Loop: Header=BB0_1 Depth=1
    jmpl    *LJTI0_0(,%eax,4)
LBB0_3:                                 # %if.then
                                        #   in Loop: Header=BB0_1 Depth=1
    movl    %esi, (%esp)
    calll   "?assign@@YAXPAI@Z"
    jmp     LBB0_1
LBB0_4:                                 # %if.then4
                                        #   in Loop: Header=BB0_1 Depth=1
    leal    4(%esi), %eax
    jmp     LBB0_5
LBB0_7:                                 # %if.then8
                                        #   in Loop: Header=BB0_1 Depth=1
    leal    8(%esi), %eax
    jmp     LBB0_5
LBB0_8:                                 # %if.then12
                                        #   in Loop: Header=BB0_1 Depth=1
    leal    12(%esi), %eax
    jmp     LBB0_5
    .section    .rdata,"rd"
    .align  4
LJTI0_0:
    .long   LBB0_3
    .long   LBB0_4
    .long   LBB0_7
    .long   LBB0_8

@majnemer
Mannequin

majnemer mannequin commented Jan 14, 2015

We could also solve this in CodeGenPrep by sinking the GEPs into the blocks
where they are used. Manually doing that IR transform and invoking llc gives
me good code:

[...]

Sounds like a job for lazy code motion, if we had it.

@rnk
Collaborator

rnk commented Jan 14, 2015

Sounds like a job for lazy code motion, if we had it.

I actually don't know what that is, but I mentioned this because I recall that the CUDA guys needed this and implemented it in a target-specific IR pass.

@qcolombet
Collaborator

The CodeGenPrepare approach is definitely the way to go here. In fact, I wonder why it is not already doing what we want since we should have all the logic there to handle this.
Reid, would you mind having a look?

Just to give some information on why rematerialization in the register allocator wouldn't help: we would have to rematerialize LEAs. At best, LEAs use only one register and define one. I.e., unless the register we use is a reserved register like SP, we would still need to carry the value of that register over to the related uses (addressing mode or not). Thus, that does not help the allocation problem and we still have to spill some value.
The only thing we would gain in that case is a reduction in spill size on the stack, when the value that feeds the LEAs is the same; that is generally not true, although it is here. Moreover, it may not save any spill instructions, though in this case it likely would.

To summarize, handling that through rematerialization would mean trading a potential spill-size improvement against a potentially more expensive computation. Indeed, we trade a reload of the spilled LEA against a reload + LEA.
I.e.,
A = lea B
spill A, @A
...
C = reload @A
=>
spill B, @B
...
D = reload @B
C = lea D

As a side note, currently the register allocator only handles "simple" rematerialization, i.e., instructions that have no register uses, such as a load of an immediate.
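For instance, in hypothetical pseudo instructions:

A = mov 42       <- no register uses: rematerializable today
A = lea B + 4    <- uses a register (B): not handled by the current rematerialization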

@chandlerc
Member

Relaying some IRC discussions here...

The CodeGenPrepare approach is definitely the way to go here. In fact, I
wonder why it is not already doing what we want since we should have all the
logic there to handle this.
Reid, would you mind having a look?

So, CGP doesn't work well for a more interesting reason.

CGP correctly handles the case where we have a load or store that we can fold the GEP into an addressing mode for. That's not the case we have.

We have an address operand to a call. The only reason it makes sense to not spill these is that computing the values and putting them in registers with LEA takes exactly the same resources as loading them from the stack even if the load is free (same addressing unit usage).

So this really seems like a case where we have an operation that can produce a value which is as cheap or cheaper than reloading that value, with no 'folding' or other factors. The only way this makes sense to do on each loop iteration is because we ran out of registers to hold the precomputed values. So I think this really needs to be more analogous to remat.
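To spell out the difference, here is a toy example in the same typed-pointer IR as above (the unmangled names @example and @assign are made up for illustration):

%struct.A = type { i32, i32, i32, i32 }
declare void @assign(i32*)

define void @example(%struct.A* %a) {
entry:
  ; GEP feeding a load: CGP can sink the GEP next to the load, and the
  ; backend folds the +4 offset into the load's addressing mode.
  %p = getelementptr inbounds %struct.A* %a, i32 0, i32 1
  %v = load i32* %p, align 4

  ; GEP feeding a call: the address itself is the operand, so it has to be
  ; materialized in a register with an LEA; there is no memory operand to
  ; fold it into.
  %q = getelementptr inbounds %struct.A* %a, i32 0, i32 1
  call void @assign(i32* %q)
  ret void
}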

Just to give some information on why rematerialization in the register
allocator wouldn't help: we would have to rematerialize LEAs. At best, LEAs
use only one register and define one. I.e., unless the register we use is a
reserved register like SP, we would still need to carry the value of that
register over to the related uses (addressing mode or not). Thus, that does
not help the allocation problem and we still have to spill some value.
The only thing we would gain in that case is a reduction in spill size on
the stack, when the value that feeds the LEAs is the same; that is generally
not true, although it is here. Moreover, it may not save any spill
instructions, though in this case it likely would.

To summarize, handling that through rematerialization would mean trading a
potential spill-size improvement against a potentially more expensive
computation. Indeed, we trade a reload of the spilled LEA against a reload +
LEA.
I.e.,
A = lea B
spill A, @A
...
C = reload @A
=>
spill B, @B
...
D = reload @B
C = lea D

This is a great example. I think there are two key reasons why this makes sense:

  1. we can remat all the uses of this vreg.
  2. there are already other users of the "base" vreg.

Put another way, if there is only one of these in the loop, then we would trade a reload of the spilled LEA against a reload + LEA; but when there are 2 of them in the loop, we trade 2 reloads against 1 reload + 2 LEAs. That makes the regalloc problem simpler in a meaningful way, I think. I'm not sure whether "2" or "3" should be the magic number, and we can use a reasonable cutoff here, but essentially at some point reducing the number of reloads should be worth the additional operations.

This is especially true of an LEA that uses no more of the addressing unit than a reload would (base + offset), as that instruction seems strictly no more expensive than the reload itself.
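Concretely, with two such addresses off the same base (offsets picked arbitrarily), the trade in the earlier notation is roughly:

A1 = lea B + 4
A2 = lea B + 8
spill A1, @A1
spill A2, @A2
...
C1 = reload @A1    <- two reloads
C2 = reload @A2
=>
spill B, @B
...
D = reload @B      <- one reload
C1 = lea D + 4     <- plus two cheap LEAs
C2 = lea D + 8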

Does that make more sense to you Quentin?

As a side note, currently the register allocator only handles "simple"
rematerialization, i.e., instructions that have no register uses, such as a
load of an immediate.

@rnk
Collaborator

rnk commented Jan 29, 2015

I wrote a comment here this morning saying something along the lines of what Chandler said. Basically, I agree it's a remat problem, because we don't have a memory access to fold into here.

@qcolombet
Collaborator

Relaying some IRC discussions here…

Thanks Chandler for the update here!

CGP correctly handles the case where we have a load or store that we can
fold the GEP into an addressing mode for. That's not the case we have.

Ah right, I should have looked at the IR and spared myself the investigation on that aspect. CodeGenPrepare does, indeed, do this folding for addressing modes only.

We have an address operand to a call. The only reason it makes sense to not
spill these is that computing the values and putting them in registers with
LEA takes exactly the same resources as loading them from the stack even if
the load is free (same addressing unit usage).

I am not sure I follow here. In that case, the LEA and the reload do not take the same resources, at least from the register allocator's point of view. The LEA still needs a non-free input register (rsi), whereas the reload does not (rbp is reserved).
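For instance, taking the first spill slot from the assembly in the original report:

    leaq    4(%rsi), %rax           ## remat: still needs %rsi live here
    movq    -48(%rbp), %rax         ## reload: only needs %rbp, which is reserved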

So this really seems like a case where we have an operation that can produce
a value which is as cheap or cheaper than reloading that value, with no
'folding' or other factors.

Well, you still need the input value of the LEA to be available, which may not be the case at the point where you want to rematerialize.

The only way this makes sense to do on each loop
iteration is because we ran out of registers to hold the precomputed values.
So I think this really needs to be more analogous to remat.

I.e.,
A = lea B
spill A, @A
...
C = reload @A
=>
spill B, @B
...
D = reload @B
C = lea D

This is a great example. I think there are two key reasons why this makes
sense:

  1. we can remat all the uses of this vreg.

Wait, I guess the code feeding the register allocator looks something like the following:
vreg1 = lea base + 1
vreg2 = lea base + 2
vreg3 = lea base + 3

loop:
  switch 1, 2, 3
bb1:
  = vreg1
  br end
bb2:
  = vreg2
  br end
bb3:
  = vreg3
  br end
end:
  [exceeded register pressure]
  back to loop

I.e., the uses of the base are fine; it is the uses of its users that are not.

  2. there are already other users of the "base" vreg.

But those are all before the point where we exceed the register pressure.

Anyway, the way the register allocator works is that it tries to split the live ranges to relax the constraints, then spills if that did not work. When you get to the spill point, you try to produce the shortest possible live ranges. Thus, you would insert the reload right before the uses.
However, here, seeing the benefit of reloading 'base' would imply placing the reload just before the switch and recognizing that it is cheaper to produce the related LEAs in each case of the switch.

There are several problems here:

  1. We need to know that vreg1, vreg2, and vreg3 share the same computation.
  2. We need to know that producing vreg1, vreg2, and vreg3 from #1 is cheap and does not need more resources.
  3. We need to find a place where we can share the reload of the base.
  4. We need to know that the place from #3 helps the register allocation problem.
  5. Right now, the live ranges are considered one by one.

For #2, we may be able to rely on some existing TableGen annotation, like isAsCheapAsAMove, but I wonder whether that would be enough.
#3 and #4 may simply fall out of the current splitting mechanism.

#5 is a particularly fundamental change for the allocator and, anyway, I do not think it should go in that direction (at least with our current algorithm).
#1 does not seem to be rematerialization specific but instead more code-motion like.

My point is that it really looks like we would need some kind of global code motion here and in particular, I do not think this should be handled as rematerialization.

Let me go back to the example to illustrate why.
Assume we perform some register-pressure-aware code motion, scheduling, ..., you name it. We could transform the code like this:
loop:
  switch 1, 2, 3
bb1:
  vreg1 = lea base + 1
  = vreg1
  br end
bb2:
  vreg2 = lea base + 2
  = vreg2
  br end
bb3:
  vreg3 = lea base + 3
  = vreg3
  br end
end:
  [exceeded register pressure] <- only base is live now.
  back to loop

Because of this change, the register pressure problem may just vanish.

Does that make more sense to you Quentin?

In my opinion, this is a code motion/global scheduling problem, not a rematerialization problem. Moreover, I do not think the register allocator, as it exists, is the right place to fix this.

Just to clarify, my definitions for general rematerialization and code motion are:

  • General rematerialization is the ability to recompute a value from other values that are available.
  • Code motion moves instructions around based on some objective functions.
    In other words, rematerializing vregX from base would require that base be available, which is not the case. Code motion does not have such a restriction.

Arguably, with some stretching of the definition of rematerialization, we end up with the definition of code motion. However, going in that direction does not seem to fit the design of the current register allocator.

The bottom line is that, for a long-term solution, I think it would be more beneficial to work toward a global scheduler with register-pressure awareness (or a register allocator with scheduling awareness) than to fix this in the current register allocator.
In the short term, we may be able to handle this case in the register allocator with a new splitting heuristic, not as rematerialization (rematerialization happens too late for that, IMHO). That sounds challenging and hence interesting to look into :).

How important is that use case?

@llvmbot llvmbot transferred this issue from llvm/llvm-bugzilla-archive Dec 9, 2021