|
| 1 | +##### |
| 2 | +##### LLVM optimization pipeline |
| 3 | +##### |
| 4 | + |
| 5 | +# https://github.com/JuliaLang/julia/blob/2eb5da0e25756c33d1845348836a0a92984861ac/src/aotcompile.cpp#L603 |
"""
    addTargetPasses!(pm, tm)

Register the target-specific analysis passes on the pass manager `pm`.

Adds the library info for the triple of the target machine `tm`, then the
transform info for `tm`. Returns the result of `add_transform_info!`.
"""
function addTargetPasses!(pm, tm)
    target_triple = LLVM.triple(tm)
    add_library_info!(pm, target_triple)
    return add_transform_info!(pm, tm)
end
| 10 | + |
| 11 | +# TODO (Missing C-API): |
| 12 | +# - https://reviews.llvm.org/D86764 adds InstSimplify |
| 13 | +# - createDivRemPairs |
| 14 | +# - createLoopLoadEliminationPass |
| 15 | +# - createVectorCombinePass |
| 16 | +# TODO (Missing LLVM.jl) |
| 17 | +# - AggressiveInstCombinePass |
| 18 | + |
| 19 | +# https://github.com/JuliaLang/julia/blob/2eb5da0e25756c33d1845348836a0a92984861ac/src/aotcompile.cpp#L620 |
"""
    addOptimizationPasses!(pm, tm, opt_level, lower_intrinsics, dump_native)

Populate the pass manager `pm` with the optimization pipeline.

# Arguments
- `pm`: pass manager that the passes are added to (mutated).
- `tm`: target machine; accepted for symmetry with the sibling `add*Passes!`
  functions, currently not used by this function.
- `opt_level`: optimization level. Only levels `>= 2` are supported; at `>= 3`
  basic alias analysis is added as well.
- `lower_intrinsics`: when `true`, append the exception-handler, GC-frame and PTLS
  lowering passes plus their clean-up passes; when `false`, only `remove_ni!` is run
  in their place.
- `dump_native`: must be `false`; it is forwarded to `lower_ptls!`.

# Returns
The result of the final `combine_mul_add!(pm)` call.

# Throws
- `ErrorException` if `opt_level < 2` or if `dump_native` is `true`. Both conditions
  are checked before any pass is added, so `pm` is left untouched on failure.
"""
function addOptimizationPasses!(pm, tm, opt_level, lower_intrinsics, dump_native)
    # Reject unsupported configurations up front so that a failing call never leaves
    # `pm` with a partially built pipeline.
    if opt_level < 2
        error("opt_level less than 2 not supported")
    end
    if dump_native
        error("dump_native not supported")
    end

    constant_merge!(pm)

    propagate_julia_addrsp!(pm)
    scoped_no_alias_aa!(pm)
    type_based_alias_analysis!(pm)
    if opt_level >= 3
        basic_alias_analysis!(pm)
    end
    cfgsimplification!(pm)
    dce!(pm)
    scalar_repl_aggregates!(pm)

    # mem_cpy_opt!(pm)

    always_inliner!(pm) # Respect always_inline

    # Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard time
    # merging the `alloca` for the unboxed data and the `alloc_opt` created `alloca`.

    alloc_opt!(pm)
    # consider AggressiveInstCombinePass at optlevel > 2

    instruction_combining!(pm)
    cfgsimplification!(pm)
    # TODO: createMultiVersioningPass belongs at this point of the pipeline once
    # `dump_native` is supported.
    scalar_repl_aggregates!(pm)
    instruction_combining!(pm) # TODO: createInstSimplifyLegacy
    jump_threading!(pm)

    reassociate!(pm)

    early_cse!(pm)

    # Load forwarding above can expose allocations that aren't actually used
    # remove those before optimizing loops.
    alloc_opt!(pm)
    loop_rotate!(pm)
    # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1)
    loop_idiom!(pm)

    # TODO: Polly (Quo vadis?)

    # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards
    lower_simdloop!(pm) # Annotate loop marked with "loopinfo" as LLVM parallel loop
    licm!(pm)
    julia_licm!(pm)
    # Subsequent passes not stripping metadata from terminator
    instruction_combining!(pm) # TODO: createInstSimplifyLegacy
    ind_var_simplify!(pm)
    loop_deletion!(pm)
    loop_unroll!(pm) # TODO: in Julia createSimpleLoopUnroll

    # Run our own SROA on heap objects before LLVM's
    alloc_opt!(pm)
    # Re-run SROA after loop-unrolling (useful for small loops that operate,
    # over the structure of an aggregate)
    scalar_repl_aggregates!(pm)
    instruction_combining!(pm) # TODO: createInstSimplifyLegacy

    gvn!(pm)
    mem_cpy_opt!(pm)
    sccp!(pm)

    # Run instcombine after redundancy elimination to exploit opportunities
    # opened up by them.
    # This needs to be InstCombine instead of InstSimplify to allow
    # loops over Union-typed arrays to vectorize.
    instruction_combining!(pm)
    jump_threading!(pm)
    dead_store_elimination!(pm)

    # More dead allocation (store) deletion before loop optimization
    # consider removing this:
    alloc_opt!(pm)

    # see if all of the constant folding has exposed more loops
    # to simplification and deletion
    # this helps significantly with cleaning up iteration
    cfgsimplification!(pm)
    loop_deletion!(pm)
    instruction_combining!(pm)
    loop_vectorize!(pm)
    # TODO: createLoopLoadEliminationPass
    cfgsimplification!(pm)
    slpvectorize!(pm)
    # might need this after LLVM 11:
    # TODO: createVectorCombinePass()

    aggressive_dce!(pm)

    if lower_intrinsics
        # LowerPTLS removes an indirect call. As a result, it is likely to trigger
        # LLVM's devirtualization heuristics, which would result in the entire
        # pass pipeline being re-executed. Prevent this by inserting a barrier.
        barrier_noop!(pm)
        lower_exc_handlers!(pm)
        gc_invariant_verifier!(pm, false)
        # Needed **before** LateLowerGCFrame on LLVM < 12
        # due to bug in `CreateAlignmentAssumption`.
        remove_ni!(pm)
        late_lower_gc_frame!(pm)
        final_lower_gc!(pm)
        # We need these two passes and the instcombine below
        # after GC lowering to let LLVM do some constant propagation on the tags.
        # and remove some unnecessary write barrier checks.
        gvn!(pm)
        sccp!(pm)
        # Remove dead use of ptls
        dce!(pm)
        lower_ptls!(pm, dump_native)
        instruction_combining!(pm)
        # Clean up write barrier and ptls lowering
        cfgsimplification!(pm)
    else
        remove_ni!(pm)
    end
    # TODO: createDivRemPairs()
    return combine_mul_add!(pm)
end
| 148 | + |
"""
    addMachinePasses!(pm, tm)

Append the late passes to the pass manager `pm`: `demote_float16!` followed by
`gvn!`. The target machine `tm` is accepted but not used here.

Returns the result of the `gvn!` call.
"""
function addMachinePasses!(pm, tm)
    demote_float16!(pm)
    gvn_result = gvn!(pm)
    return gvn_result
end
| 153 | + |
"""
    run_pipeline!(mod::LLVM.Module)

Run the full pass pipeline over `mod`.

A fresh `LLVM.ModulePassManager` is populated with the target passes, the
optimization passes (`opt_level = 3`, `lower_intrinsics = true`,
`dump_native = false`) and the machine passes, all configured from `tm[]`
(`tm` is defined outside this function), and is then run on `mod`.

Returns the value produced by the `LLVM.ModulePassManager() do ... end` call, whose
block evaluates to `run!(pm, mod)`.
"""
function run_pipeline!(mod::LLVM.Module)
    return LLVM.ModulePassManager() do pm
        target_machine = tm[]
        addTargetPasses!(pm, target_machine)
        addOptimizationPasses!(pm, target_machine, 3, true, false)
        addMachinePasses!(pm, target_machine)
        run!(pm, mod)
    end
end
0 commit comments