A certain code pattern, where a function mostly consists of a large loop around a switch or if-else condition results in large live ranges for basic address computations. The register allocator fails to rematerialize these address computations in addressing modes of memory operands and instead spills and reloads them from the stack dramatically increasing stack usage and generally causing badness. Small reproduction: struct A { unsigned a; unsigned b; unsigned c; unsigned d; }; void assign(unsigned *val); void f(unsigned char *input, A *a) { for (;;) { unsigned char t = (++input)[0]; if (t == 0) { assign(&a->a); } else if (t == 1) { assign(&a->b); } else if (t == 2) { assign(&a->c); } else if (t == 3) { assign(&a->d); } } } Resulting in: __Z1fPhP1A: ## @_Z1fPhP1A .cfi_startproc ## BB#0: ## %entry pushq %rbp Ltmp0: .cfi_def_cfa_offset 16 Ltmp1: .cfi_offset %rbp, -16 movq %rsp, %rbp Ltmp2: .cfi_def_cfa_register %rbp pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbx subq $24, %rsp Ltmp3: .cfi_offset %rbx, -56 Ltmp4: .cfi_offset %r12, -48 Ltmp5: .cfi_offset %r13, -40 Ltmp6: .cfi_offset %r14, -32 Ltmp7: .cfi_offset %r15, -24 movq %rdi, %rbx leaq 4(%rsi), %rax movq %rax, -48(%rbp) ## 8-byte Spill leaq 8(%rsi), %rax movq %rax, -56(%rbp) ## 8-byte Spill leaq 12(%rsi), %rax movq %rax, -64(%rbp) ## 8-byte Spill leaq 16(%rsi), %r14 leaq 20(%rsi), %r15 movq %rsi, %r13 incq %rbx leaq LJTI0_0(%rip), %r12 jmp LBB0_1 [.....]
We could also solve this in CodeGenPrep by sinking the GEPs into the blocks where they are used. Manually doing that IR transform and invoking llc gives me good code: $ cat t.ll ; ModuleID = 't.cpp' target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32" target triple = "i686-pc-windows-msvc" %struct.A = type { i32, i32, i32, i32 } ; Function Attrs: noreturn define void @"\01?f@@YAXPAEPAUA@@@Z"(i8* nocapture readonly %input, %struct.A* %a) #0 { entry: br label %for.cond for.cond: ; preds = %for.cond.backedge, %entry %input.addr.0 = phi i8* [ %input, %entry ], [ %incdec.ptr, %for.cond.backedge ] %incdec.ptr = getelementptr inbounds i8* %input.addr.0, i32 1 %0 = load i8* %incdec.ptr, align 1, !tbaa !1 switch i8 %0, label %for.cond.backedge [ i8 0, label %if.then i8 1, label %if.then4 i8 2, label %if.then8 i8 3, label %if.then12 ] if.then: ; preds = %for.cond %a1 = getelementptr inbounds %struct.A* %a, i32 0, i32 0 tail call void @"\01?assign@@YAXPAI@Z"(i32* %a1) br label %for.cond.backedge if.then4: ; preds = %for.cond %b = getelementptr inbounds %struct.A* %a, i32 0, i32 1 tail call void @"\01?assign@@YAXPAI@Z"(i32* %b) br label %for.cond.backedge for.cond.backedge: ; preds = %if.then4, %if.then12, %if.then8, %if.then, %for.cond br label %for.cond if.then8: ; preds = %for.cond %c = getelementptr inbounds %struct.A* %a, i32 0, i32 2 tail call void @"\01?assign@@YAXPAI@Z"(i32* %c) br label %for.cond.backedge if.then12: ; preds = %for.cond %d = getelementptr inbounds %struct.A* %a, i32 0, i32 3 tail call void @"\01?assign@@YAXPAI@Z"(i32* %d) br label %for.cond.backedge } declare void @"\01?assign@@YAXPAI@Z"(i32*) #1 attributes #0 = { noreturn "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" 
"no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } !llvm.ident = !{!0} !0 = !{!"clang version 3.6.0 "} !1 = !{!2, !2, i64 0} !2 = !{!"omnipotent char", !3, i64 0} !3 = !{!"Simple C/C++ TBAA"} $ llc t.ll -o - .text .def @feat.00; .scl 3; .type 0; .endef .globl @feat.00 @feat.00 = 1 .def "?f@@YAXPAEPAUA@@@Z"; .scl 2; .type 32; .endef .globl "?f@@YAXPAEPAUA@@@Z" .align 16, 0x90 "?f@@YAXPAEPAUA@@@Z": # @"\01?f@@YAXPAEPAUA@@@Z" # BB#0: # %entry pushl %ebp movl %esp, %ebp pushl %edi pushl %esi pushl %eax movl 12(%ebp), %esi movl 8(%ebp), %edi jmp LBB0_1 .align 16, 0x90 LBB0_5: # %for.cond.backedge # in Loop: Header=BB0_1 Depth=1 movl %eax, (%esp) calll "?assign@@YAXPAI@Z" LBB0_1: # %for.cond # =>This Inner Loop Header: Depth=1 incl %edi movzbl (%edi), %eax cmpl $3, %eax ja LBB0_1 # BB#2: # %for.cond # in Loop: Header=BB0_1 Depth=1 jmpl *LJTI0_0(,%eax,4) LBB0_3: # %if.then # in Loop: Header=BB0_1 Depth=1 movl %esi, (%esp) calll "?assign@@YAXPAI@Z" jmp LBB0_1 LBB0_4: # %if.then4 # in Loop: Header=BB0_1 Depth=1 leal 4(%esi), %eax jmp LBB0_5 LBB0_7: # %if.then8 # in Loop: Header=BB0_1 Depth=1 leal 8(%esi), %eax jmp LBB0_5 LBB0_8: # %if.then12 # in Loop: Header=BB0_1 Depth=1 leal 12(%esi), %eax jmp LBB0_5 .section .rdata,"rd" .align 4 LJTI0_0: .long LBB0_3 .long LBB0_4 .long LBB0_7 .long LBB0_8
(In reply to comment #1) > We could also solve this in CodeGenPrep by sinking the GEPs into the blocks > where they are used. Manually doing that IR transform and invoking llc gives > me good code: > > $ cat t.ll > ; ModuleID = 't.cpp' > target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32" > target triple = "i686-pc-windows-msvc" > > %struct.A = type { i32, i32, i32, i32 } > > ; Function Attrs: noreturn > define void @"\01?f@@YAXPAEPAUA@@@Z"(i8* nocapture readonly %input, > %struct.A* %a) #0 { > entry: > br label %for.cond > > for.cond: ; preds = > %for.cond.backedge, %entry > %input.addr.0 = phi i8* [ %input, %entry ], [ %incdec.ptr, > %for.cond.backedge ] > %incdec.ptr = getelementptr inbounds i8* %input.addr.0, i32 1 > %0 = load i8* %incdec.ptr, align 1, !tbaa !1 > switch i8 %0, label %for.cond.backedge [ > i8 0, label %if.then > i8 1, label %if.then4 > i8 2, label %if.then8 > i8 3, label %if.then12 > ] > > if.then: ; preds = %for.cond > %a1 = getelementptr inbounds %struct.A* %a, i32 0, i32 0 > tail call void @"\01?assign@@YAXPAI@Z"(i32* %a1) > br label %for.cond.backedge > > if.then4: ; preds = %for.cond > %b = getelementptr inbounds %struct.A* %a, i32 0, i32 1 > tail call void @"\01?assign@@YAXPAI@Z"(i32* %b) > br label %for.cond.backedge > > for.cond.backedge: ; preds = %if.then4, > %if.then12, %if.then8, %if.then, %for.cond > br label %for.cond > > if.then8: ; preds = %for.cond > %c = getelementptr inbounds %struct.A* %a, i32 0, i32 2 > tail call void @"\01?assign@@YAXPAI@Z"(i32* %c) > br label %for.cond.backedge > > if.then12: ; preds = %for.cond > %d = getelementptr inbounds %struct.A* %a, i32 0, i32 3 > tail call void @"\01?assign@@YAXPAI@Z"(i32* %d) > br label %for.cond.backedge > } > > declare void @"\01?assign@@YAXPAI@Z"(i32*) #1 > > attributes #0 = { noreturn "less-precise-fpmad"="false" > "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" > "no-infs-fp-math"="false" "no-nans-fp-math"="false" > "stack-protector-buffer-size"="8" 
"unsafe-fp-math"="false" > "use-soft-float"="false" } > attributes #1 = { "less-precise-fpmad"="false" > "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" > "no-infs-fp-math"="false" "no-nans-fp-math"="false" > "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" > "use-soft-float"="false" } > > !llvm.ident = !{!0} > > !0 = !{!"clang version 3.6.0 "} > !1 = !{!2, !2, i64 0} > !2 = !{!"omnipotent char", !3, i64 0} > !3 = !{!"Simple C/C++ TBAA"} > > $ llc t.ll -o - > .text > .def @feat.00; > .scl 3; > .type 0; > .endef > .globl @feat.00 > @feat.00 = 1 > .def "?f@@YAXPAEPAUA@@@Z"; > .scl 2; > .type 32; > .endef > .globl "?f@@YAXPAEPAUA@@@Z" > .align 16, 0x90 > "?f@@YAXPAEPAUA@@@Z": # @"\01?f@@YAXPAEPAUA@@@Z" > # BB#0: # %entry > pushl %ebp > movl %esp, %ebp > pushl %edi > pushl %esi > pushl %eax > movl 12(%ebp), %esi > movl 8(%ebp), %edi > jmp LBB0_1 > .align 16, 0x90 > LBB0_5: # %for.cond.backedge > # in Loop: Header=BB0_1 Depth=1 > movl %eax, (%esp) > calll "?assign@@YAXPAI@Z" > LBB0_1: # %for.cond > # =>This Inner Loop Header: Depth=1 > incl %edi > movzbl (%edi), %eax > cmpl $3, %eax > ja LBB0_1 > # BB#2: # %for.cond > # in Loop: Header=BB0_1 Depth=1 > jmpl *LJTI0_0(,%eax,4) > LBB0_3: # %if.then > # in Loop: Header=BB0_1 Depth=1 > movl %esi, (%esp) > calll "?assign@@YAXPAI@Z" > jmp LBB0_1 > LBB0_4: # %if.then4 > # in Loop: Header=BB0_1 Depth=1 > leal 4(%esi), %eax > jmp LBB0_5 > LBB0_7: # %if.then8 > # in Loop: Header=BB0_1 Depth=1 > leal 8(%esi), %eax > jmp LBB0_5 > LBB0_8: # %if.then12 > # in Loop: Header=BB0_1 Depth=1 > leal 12(%esi), %eax > jmp LBB0_5 > .section .rdata,"rd" > .align 4 > LJTI0_0: > .long LBB0_3 > .long LBB0_4 > .long LBB0_7 > .long LBB0_8 Sounds like a job for lazy code motion, if we had it.
(In reply to comment #2) > Sounds like a job for lazy code motion, if we had it. I actually don't know what that is, but I mentioned this because I recall that the CUDA guys needed this and implemented it in a target-specific IR pass.
The CodeGenPrepare approach is definitely the way to go here. In fact, I wonder why it is not already doing what we want since we should have all the logic there to handle this. Reid, would you mind having a look? Just to give some information on why the rematerialization in register allocator wouldn’t help: We have to rematerialize LEAs. At best, LEAs use only one register and define one. I.e., unless the register we use is a reserved register like SP, we would still need to carry over the value of that register to the related uses (addressing mode or not). Thus that does not help the allocation problem and we still have to spill some value. The only thing we would get in that case is a reduction in spill size on the stack as the value that feeds the LEAs is the same. It is generally not true. Moreover, it may not save spill instructions. Though in that case that would likely be the case. To summarize handling that through rematerialization would mean we trade potential spill size improvement against potentially more expensive computation. Indeed, we trade a reload of the spilled LEA against a reload + LEA. I.e., A = lea B spill A, @A ... C = reload @A => spill B, @B ... D = reload @B C = lea D As a side note, currently, the register allocator only handles “simple” rematerialization, i.e., instructions that do not have any uses like load of immediate.
Relaying some IRC discussions here... (In reply to comment #4) > The CodeGenPrepare approach is definitely the way to go here. In fact, I > wonder why it is not already doing what we want since we should have all the > logic there to handle this. > Reid, would you mind having a look? So, CGP doesn't work well for a more interesting reason. CGP correctly handles the case where we have a load or store that we can fold the GEP into an addressing mode for. That's not the case we have. We have an address operand to a call. The only reason it makes sense to not spill these is that computing the values and putting them in registers with LEA takes exactly the same resources as loading them from the stack even if the load is free (same addressing unit usage). So this really seems like a case where we have an operation that can produce a value which is as cheap or cheaper than reloading that value, with no 'folding' or other factors. The only way this makes sense to do on each loop iteration is because we ran out of registers to hold the precomputed values. So I think this really needs to be more analogous to remat. > Just to give some information on why the rematerialization in register > allocator wouldn’t help: We have to rematerialize LEAs. At best, LEAs use > only one register and define one. I.e., unless the register we use is a > reserved register like SP, we would still need to carry over the value of > that register to the related uses (addressing mode or not). Thus that does > not help the allocation problem and we still have to spill some value. > The only thing we would get in that case is a reduction in spill size on the > stack as the value that feeds the LEAs is the same. It is generally not > true. Moreover, it may not save spill instructions. Though in that case that > would likely be the case. > > To summarize handling that through rematerialization would mean we trade > potential spill size improvement against potentially more expensive > computation. 
Indeed, we trade a reload of the spilled LEA against a reload + > LEA. > I.e., > A = lea B > spill A, @A > ... > C = reload @A > => > spill B, @B > ... > D = reload @B > C = lea D This is a great example. I think there are two key reasons why this makes sense: 1) we can remat *all* the uses of this vreg. 2) there are already other users of the "base" vreg. Put another way, while if there is only one of these in the loop then we would trade a reload of the spilled LEA against a reload + LEA, when there are 2 entries in the loop, we trade 2 reloads against 1 reload + 2 LEAs. *that* makes the regalloc problem simpler in a meaningful way I think. I'm not sure if "2" or "3" should be the magical number, we can use a reasonable cutoff here, but essentially at some point reducing the number of reloads should be worth the additional operations. This is then especially true of an LEA which uses no more of the addressing unit than a reload would (base + offset) as that instruction seems strictly no more expensive than the reload itself. Does that make more sense to you Quentin? > > As a side note, currently, the register allocator only handles “simple” > rematerialization, i.e., instructions that do not have any uses like load of > immediate.
I wrote a comment here this morning something along the lines of what Chandler said. Basically, I agree it's a remat problem because we don't have a memory access to fold into here.
(In reply to comment #5) > Relaying some IRC discussions here… Thanks Chandler for the update here! > CGP correctly handles the case where we have a load or store that we can > fold the GEP into an addressing mode for. That's not the case we have. > Ah right, I should have looked into the IR and spare the investigation on that aspect. CodeGenPrepare does, indeed, this folding for addressing modes only. > We have an address operand to a call. The only reason it makes sense to not > spill these is that computing the values and putting them in registers with > LEA takes exactly the same resources as loading them from the stack even if > the load is free (same addressing unit usage). I am not sure I follow here. In that case the LEA and the reload do not take the same resources, at least from the register allocator point of view. The LEA still needs a non-free input register (rsi), whereas the reload does not (rbp). > > So this really seems like a case where we have an operation that can produce > a value which is as cheap or cheaper than reloading that value, with no > 'folding' or other factors. Well, you still need the input value of the LEA to be available, which may not be the case where you want to rematerialize. > The only way this makes sense to do on each loop > iteration is because we ran out of registers to hold the precomputed values. > So I think this really needs to be more analogous to remat. > > > I.e., > > A = lea B > > spill A, @A > > ... > > C = reload @A > > => > > spill B, @B > > ... > > D = reload @B > > C = lea D > > This is great example. I think there are two key reasons why this makes > sense: > > 1) we can remat *all* the uses of this vreg. 
Wait, I guess the code feeding the register allocator is something like the following: vreg1 = lea base + 1 vreg2 = lea base + 2 vreg3 = lea base + 3 loop: switch 1, 2, 3 bb1: = vreg1 br end bb2: = vreg2 br end bb3: = vreg3 br end end: [exceeded register pressure] back to loop I.e., the uses of the base are fine, this is the uses of its users that are not. > 2) there are already other users of the "base" vreg. But those are *all* before the point where we exceed the register pressure. Anyway, the way the register allocator works is that it tries to split the live-ranges to relax the constraints then spills if it did not work. When you get to the spill point, you try to produce the shortest possible live-ranges. Thus, you would insert the reload right before the uses. However, here to see the benefits of reloading ‘base' would imply to place the reload just before the switch and see that it is cheaper to produce the related LEAs in each case of the switch. There are several problems here: 1. We need to know that vreg1, vreg2, and vreg3 share the same computation. 2. We need to know that producing vreg1, vreg2, and vreg3 from #1 is cheap and does not need more resources. 3. We need to find a place where we can share the reload of the base. 4. We need to know that the place from #3 helps the register allocation problem. 5. Right now, the live ranges are considered one by one. For #2, we may be able to rely on some existing tablegen annotation, like isCheapAsMove but I wonder if that would be enough. #3 and #4 may simply fall from the current splitting mechanism. #5 is a particularly fundamental change for the allocator and anyway, I do not think it should go in that direction (at least with our current algorithm). #1 does not seem to be rematerialization specific but instead more code motion like. My point is that it really looks like we would need some kind of global code motion here and in particular, I do not think this should be handled as rematerialization. 
Let me go back to the example to illustrate why. Assume we perform a register pressure aware code motion, scheduling, ..., you name it. We could transform the code like this: loop: switch 1, 2, 3 bb1: vreg1 = lea base + 1 = vreg1 br end bb2: vreg2 = lea base + 2 = vreg2 br end bb3: vreg3 = lea base + 3 = vreg3 br end end: [exceeded register pressure] <— only base is alive now. back to loop Because of this change, the register pressure problem may just vanish. > Does that make more sense to you Quentin? > In my opinion, this is a code motion/global scheduling problem, not a rematerialization problem. Moreover, I do not think the register allocator, as it exists, is the right place to fix this. Just to clarify, my definitions for general rematerialization and code motion are: - General rematerialization is the ability to recompute a value from other values that are *available*. - Code motion moves instructions around based on some objective functions. In other words, rematerializing vregX from base would require that base is available, which is not the case. Code motion does not have such restrictions. Arguably, with some stretch of the definition of rematerialization we end up with a code motion definition. However, going into that direction does not seem to fit the design of the current register allocator. The bottom line is for a long-term solution, I think it would be more beneficial to work toward a global scheduler with register pressure awareness (or a register allocator with scheduling awareness) than to fix that in the current register allocator. For short-term, we may be able to handle this case in the register allocator with a new splitting heuristic, not a rematerialization thing (rematerialization is too late for that IMHO). That sounds challenging and hence interesting to look into :). How important is that use case?