[llvm cookbook] IR优化

本文介绍如何使用opt工具优化llvm ir。

使用之前编写的代码 multiply.c

int mult() {
int a = 5;
int b = 3;
int c = a * b;
return c;
}

执行命令

clang -emit-llvm -S multiply.c -o multiply.ll

生成 multiply.ll

; ModuleID = 'multiply.c'
source_filename = "multiply.c"
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.14.0"

; Function Attrs: noinline nounwind optnone ssp uwtable
define i32 @mult() #0 {
entry:
%a = alloca i32, align 4
%b = alloca i32, align 4
%c = alloca i32, align 4
store i32 5, i32* %a, align 4
store i32 3, i32* %b, align 4
%0 = load i32, i32* %a, align 4
%1 = load i32, i32* %b, align 4
%mul = mul nsw i32 %0, %1
store i32 %mul, i32* %c, align 4
%2 = load i32, i32* %c, align 4
ret i32 %2
}

attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 7.0.0 (trunk 324834)"}

执行命令:

opt -mem2reg -S multiply.ll -o multiply1.ll

输出 multiply1.ll

; ModuleID = 'multiply.ll'
source_filename = "multiply.c"
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.14.0"

; Function Attrs: noinline nounwind optnone ssp uwtable
define i32 @mult() #0 {
entry:
%a = alloca i32, align 4
%b = alloca i32, align 4
%c = alloca i32, align 4
store i32 5, i32* %a, align 4
store i32 3, i32* %b, align 4
%0 = load i32, i32* %a, align 4
%1 = load i32, i32* %b, align 4
%mul = mul nsw i32 %0, %1
store i32 %mul, i32* %c, align 4
%2 = load i32, i32* %c, align 4
ret i32 %2
}

attributes #0 = { noinline nounwind optnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{!"clang version 7.0.0 (trunk 324834)"}

我在mac上执行的,不知道为什么结果和书上的不一致,这个优化好像并没有生效。

其他的优化选项还有

  • adce: Aggressive Dead Code Elimination
  • bb-vectorize: Basic-Block Vectorization
  • constprop: Simple constant propagation
  • dce: Dead Code Elimination
  • deadargelim: Dead Argument Elimination
  • globaldce: Dead Global Elimination
  • globalopt: Global Variable Optimizer
  • gvn: Global Value Numbering
  • inline: Function Integration/Inlining
  • instcombine: Combine redundant instructions
  • licm: Loop Invariant Code Motion
  • loop: unswitch: Unswithch Loop
  • loweratomic: Lower atomic intrinsics to non-atomic form
  • lowerinvoke: Lower invokes to calls, for unwindless code generators
  • lowerswitch: Lower SwithcInsts to branches
  • mem2reg: Promote Memory to Registry
  • memcpyopt: MemCpy Optimization
  • simplifycfg: Simplify the CFG
  • sink: Code sinking
  • tailcallelim: Tail Call Elimination

可以在源码目录 test/Transforms/ 下找到测试代码。