diff --git a/src/compiler/optimize.jl b/src/compiler/optimize.jl index 839f7cd2db..027f1b22b7 100644 --- a/src/compiler/optimize.jl +++ b/src/compiler/optimize.jl @@ -9,608 +9,335 @@ LLVM.@module_pass "preserve-nvvm-end" PreserveNVVMEndPass const RunAttributor = Ref(true) -@static if VERSION < v"1.11.0-DEV.428" -else - barrier_noop!(pm) = nothing +function enzyme_attributor_pass!(mod::LLVM.Module) + ccall( + (:RunAttributorOnModule, API.libEnzyme), + Cvoid, + (LLVM.API.LLVMModuleRef,), + mod, + ) + return true end -@static if VERSION < v"1.11-" - function gc_invariant_verifier_tm!(pm::ModulePassManager, tm::LLVM.TargetMachine, cond::Bool) - gc_invariant_verifier!(pm, cond) - end -else - function gc_invariant_verifier_tm!(pm::ModulePassManager, tm::LLVM.TargetMachine, cond::Bool) - function gc_invariant_verifier(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, GCInvariantVerifierPass(; strong = cond)) - end - end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("GCInvariantVerifier", gc_invariant_verifier)) - end -end +EnzymeAttributorPass() = NewPMModulePass("enzyme_attributor", enzyme_attributor_pass!) +ReinsertGCMarkerPass() = NewPMFunctionPass("reinsert_gcmarker", reinsert_gcmarker_pass!) +SafeAtomicToRegularStorePass() = NewPMFunctionPass("safe_atomic_to_regular_store", safe_atomic_to_regular_store!) +Addr13NoAliasPass() = NewPMModulePass("addr13_noalias", addr13NoAlias) +RewriteGenericMemoryPass() = NewPMModulePass("rewrite_generic_memory", rewrite_generic_memory!) -@static if VERSION < v"1.11-" - function propagate_julia_addrsp_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - propagate_julia_addrsp!(pm) - end -else - function propagate_julia_addrsp_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function prop_julia_addr(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, PropagateJuliaAddrspacesPass()) - end - end - run!(pb, mod) - end - return true +function optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine) + @dispose pb = NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) + register!(pb, Addr13NoAliasPass()) + add!(pb, NewPMAAManager()) do aam + add!(aam, ScopedNoAliasAA()) + add!(aam, TypeBasedAA()) + add!(aam, BasicAA()) end - add!(pm, ModulePass("PropagateJuliaAddrSpace", prop_julia_addr)) - end -end + add!(pb, NewPMModulePassManager()) do mpm + add!(mpm, Addr13NoAliasPass()) -@static if VERSION < v"1.11-" - function alloc_opt_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - alloc_opt!(pm) - end -else - function alloc_opt_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function alloc_opt(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, AllocOptPass()) - end - end - run!(pb, mod) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, PropagateJuliaAddrspacesPass()) + add!(fpm, SimplifyCFGPass()) + add!(fpm, DCEPass()) end - return true - end - add!(pm, ModulePass("AllocOpt", alloc_opt)) - end -end - -@static if VERSION < v"1.11-" - function remove_ni_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - remove_ni!(pm) - end -else - function remove_ni_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function remove_ni(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, RemoveNIPass()) - end - run!(pb, mod) + add!(mpm, CPUFeaturesPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, SROAPass()) + add!(fpm, MemCpyOptPass()) end - return true - end - add!(pm, ModulePass("RemoveNI", remove_ni)) - end -end - -@static if VERSION < v"1.11-" - function julia_licm_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - julia_licm!(pm) - end -else - function julia_licm_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function julia_licm(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, NewPMLoopPassManager()) do lpm - add!(lpm, JuliaLICMPass()) - end - end - end - run!(pb, mod) + add!(mpm, AlwaysInlinerPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, AllocOptPass()) end - return true end - # really looppass - add!(pm, ModulePass("JuliaLICM", julia_licm)) + run!(pb, mod, tm) end -end -@static if VERSION < v"1.11-" - function lower_simdloop_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - lower_simdloop!(pm) - end -else - function lower_simdloop_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function lower_simdloop(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, NewPMLoopPassManager()) do lpm - add!(lpm, LowerSIMDLoopPass()) - end - end - end - run!(pb, mod) + # Globalopt is separated as it can delete functions, which invalidates the Julia hardcoded pointers to + # known functions + @dispose pb = NewPMPassBuilder() begin + add!(pb, NewPMAAManager()) do aam + add!(aam, ScopedNoAliasAA()) + add!(aam, TypeBasedAA()) + add!(aam, BasicAA()) + end + add!(pb, NewPMModulePassManager()) do mpm + add!(mpm, CPUFeaturesPass()) # why is this duplicated? + add!(mpm, GlobalOptPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, GVNPass()) end - return true end - # really looppass - add!(pm, ModulePass("LowerSIMDLoop", lower_simdloop)) + run!(pb, mod, tm) end -end - -function loop_optimizations_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - @static if true || VERSION < v"1.11-" - lower_simdloop_tm!(pm, tm) - licm!(pm) - if LLVM.version() >= v"15" - simple_loop_unswitch_legacy!(pm) - else - loop_unswitch!(pm) + function middle_optimize!(second_stage=false) + @dispose pb = NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) + register!(pb, RewriteGenericMemoryPass()) + add!(pb, NewPMAAManager()) do aam + add!(aam, ScopedNoAliasAA()) + add!(aam, TypeBasedAA()) + add!(aam, BasicAA()) end - else - @assert false - end -end - - -function more_loop_optimizations_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - @static if true || VERSION < v"1.11-" - loop_rotate!(pm) - # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) - loop_idiom!(pm) - - # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards - lower_simdloop_tm!(pm, tm) # Annotate loop marked with "loopinfo" as LLVM parallel loop - licm!(pm) - julia_licm_tm!(pm, tm) - # Subsequent passes not stripping metadata from terminator - instruction_combining!(pm) # TODO: createInstSimplifyLegacy - jl_inst_simplify!(pm) - - ind_var_simplify!(pm) - loop_deletion!(pm) - loop_unroll!(pm) # TODO: in Julia createSimpleLoopUnroll - else - @assert false - end -end - -@static if VERSION < v"1.11-" - function demote_float16_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - demote_float16!(pm) - end -else - function demote_float16_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function demote_float16(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, DemoteFloat16Pass()) - end + add!(pb, NewPMModulePassManager()) do mpm + add!(mpm, RewriteGenericMemoryPass()) + add!(mpm, CPUFeaturesPass()) # why is this duplicated? + + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, SimplifyCFGPass()) + add!(fpm, SROAPass()) + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, JumpThreadingPass()) + add!(fpm, CorrelatedValuePropagationPass()) + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, ReassociatePass()) + add!(fpm, EarlyCSEPass()) + add!(fpm, AllocOptPass()) + + add!(fpm, NewPMLoopPassManager(use_memory_ssa=true)) do lpm + add!(lpm, LoopIdiomRecognizePass()) + add!(lpm, LoopRotatePass()) + add!(lpm, LowerSIMDLoopPass()) + add!(lpm, LICMPass()) + add!(lpm, JuliaLICMPass()) + add!(lpm, SimpleLoopUnswitchPass()) end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("DemoteFloat16", demote_float16)) - end -end -@static if VERSION < v"1.11-" - function lower_exc_handlers_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - lower_exc_handlers!(pm) - end -else - function lower_exc_handlers_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function lower_exc_handlers(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, LowerExcHandlersPass()) - end + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, NewPMLoopPassManager()) do lpm + add!(lpm, IndVarSimplifyPass()) + add!(lpm, LoopDeletionPass()) end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("LowerExcHandlers", lower_exc_handlers)) - end -end - -@static if VERSION < v"1.11-" - function lower_ptls_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine, dump_native::Bool) - lower_ptls!(pm, dump_native) - end -else - function lower_ptls_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine, dump_native::Bool) - function lower_ptls(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, LowerPTLSPass()) + add!(fpm, LoopUnrollPass(opt_level=2)) # what opt level? + add!(fpm, AllocOptPass()) + add!(fpm, SROAPass()) + add!(fpm, GVNPass()) + + # This InstCombine needs to be after GVN + # Otherwise it will generate load chains in GPU code... + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, MemCpyOptPass()) + add!(fpm, SCCPPass()) + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, JumpThreadingPass()) + add!(fpm, DSEPass()) + add!(fpm, AllocOptPass()) + add!(fpm, SimplifyCFGPass()) + + + add!(fpm, NewPMLoopPassManager()) do lpm + add!(lpm, LoopIdiomRecognizePass()) + add!(lpm, LoopDeletionPass()) end - run!(pb, mod) + add!(fpm, JumpThreadingPass()) + add!(fpm, CorrelatedValuePropagationPass()) + if second_stage + + add!(fpm, ADCEPass()) + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + + # GC passes + add!(fpm, GCInvariantVerifierPass(strong=false)) + add!(fpm, SimplifyCFGPass()) + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + end # second_stage end - return true end - add!(pm, ModulePass("LowerPTLS", lower_ptls)) + run!(pb, mod, tm) end -end + end # middle_optimize! -@static if VERSION < v"1.11-" - function combine_mul_add_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - combine_mul_add!(pm) - end -else - function combine_mul_add_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) -@static if VERSION < v"1.12.0-DEV.1390" - function combine_mul_add(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, CombineMulAddPass()) - end - end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("CombineMulAdd", combine_mul_add)) -end - end -end + middle_optimize!() + middle_optimize!(true) -@static if VERSION < v"1.11-" - function late_lower_gc_frame_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - late_lower_gc_frame!(pm) - end -else - function late_lower_gc_frame_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function late_lower_gc_frame(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, LateLowerGCPass()) - end - end - run!(pb, mod) - end - return true + # Globalopt is separated as it can delete functions, which invalidates the Julia hardcoded pointers to + # known functions + @dispose pb = NewPMPassBuilder() begin + add!(pb, NewPMAAManager()) do aam + add!(aam, ScopedNoAliasAA()) + add!(aam, TypeBasedAA()) + add!(aam, BasicAA()) end - add!(pm, ModulePass("LateLowerGCFrame", late_lower_gc_frame)) - end -end - -@static if VERSION < v"1.11-" - function final_lower_gc_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - final_lower_gc!(pm) - end -else - function final_lower_gc_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function final_lower_gc(mod::LLVM.Module) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, NewPMFunctionPassManager()) do fpm - add!(fpm, FinalLowerGCPass()) - end - end - run!(pb, mod) + add!(pb, NewPMModulePassManager()) do mpm + add!(mpm, GlobalOptPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, GVNPass()) end - return true end - add!(pm, ModulePass("FinalLowerGCFrame", final_lower_gc)) + run!(pb, mod, tm) end + removeDeadArgs!(mod, tm) + detect_writeonly!(mod) + nodecayed_phis!(mod) end -@static if VERSION < v"1.11-" - function cpu_features_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - @static if isdefined(LLVM.Interop, :cpu_features!) - LLVM.Interop.cpu_features!(pm) - else - @static if isdefined(GPUCompiler, :cpu_features!) - GPUCompiler.cpu_features!(pm) - end - end +function addOptimizationPasses!(mpm::LLVM.NewPMPassManager) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, ReinsertGCMarkerPass()) end -else - function cpu_features_tm!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - function cpu_features(mod) - @dispose pb = NewPMPassBuilder() begin - add!(pb, NewPMModulePassManager()) do mpm - add!(mpm, CPUFeaturesPass()) - end - run!(pb, mod) - end - return true - end - add!(pm, ModulePass("CPUFeatures", cpu_features)) - end -end -function jl_inst_simplify!(PM::LLVM.ModulePassManager) - ccall( - (:LLVMAddJLInstSimplifyPass, API.libEnzyme), - Cvoid, - (LLVM.API.LLVMPassManagerRef,), - PM, - ) -end + add!(mpm, ConstantMergePass()) -cse!(pm) = LLVM.API.LLVMAddEarlyCSEPass(pm) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, PropagateJuliaAddrspacesPass()) -function optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine) - addr13NoAlias(mod) - # everying except unroll, slpvec, loop-vec - # then finish Julia GC - ModulePassManager() do pm - add_library_info!(pm, triple(mod)) - add_transform_info!(pm, tm) - - propagate_julia_addrsp_tm!(pm, tm) - scoped_no_alias_aa!(pm) - type_based_alias_analysis!(pm) - basic_alias_analysis!(pm) - cfgsimplification!(pm) - dce!(pm) - cpu_features_tm!(pm, tm) - scalar_repl_aggregates_ssa!(pm) # SSA variant? - mem_cpy_opt!(pm) - always_inliner!(pm) - alloc_opt_tm!(pm, tm) - LLVM.run!(pm, mod) + add!(fpm, SimplifyCFGPass()) + add!(fpm, DCEPass()) + add!(fpm, SROAPass()) end - # Globalopt is separated as it can delete functions, which invalidates the Julia hardcoded pointers to - # known functions - ModulePassManager() do pm + add!(mpm, AlwaysInlinerPass()) - add_library_info!(pm, triple(mod)) - add_transform_info!(pm, tm) + add!(mpm, NewPMFunctionPassManager()) do fpm + # Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard time + # merging the `alloca` for the unboxed data and the `alloca` created by the `alloc_opt` + # pass. - scoped_no_alias_aa!(pm) - type_based_alias_analysis!(pm) - basic_alias_analysis!(pm) - cpu_features_tm!(pm, tm) + add!(fpm, AllocOptPass()) + # consider AggressiveInstCombinePass at optlevel > 2 - LLVM.API.LLVMAddGlobalOptimizerPass(pm) # Extra - gvn!(pm) # Extra - LLVM.run!(pm, mod) - end + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, SimplifyCFGPass()) + add!(fpm, SROAPass()) + add!(fpm, InstSimplifyPass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, JumpThreadingPass()) + add!(fpm, CorrelatedValuePropagationPass()) - rewrite_generic_memory!(mod) - - ModulePassManager() do pm - add_library_info!(pm, triple(mod)) - add_transform_info!(pm, tm) - - scoped_no_alias_aa!(pm) - type_based_alias_analysis!(pm) - basic_alias_analysis!(pm) - cpu_features_tm!(pm, tm) - - instruction_combining!(pm) - jl_inst_simplify!(pm) - cfgsimplification!(pm) - scalar_repl_aggregates_ssa!(pm) # SSA variant? - instruction_combining!(pm) - jl_inst_simplify!(pm) - jump_threading!(pm) - correlated_value_propagation!(pm) - instruction_combining!(pm) - jl_inst_simplify!(pm) - reassociate!(pm) - early_cse!(pm) - alloc_opt_tm!(pm, tm) - loop_idiom!(pm) - loop_rotate!(pm) - - loop_optimizations_tm!(pm, tm) - - instruction_combining!(pm) - jl_inst_simplify!(pm) - ind_var_simplify!(pm) - loop_deletion!(pm) - loop_unroll!(pm) - alloc_opt_tm!(pm, tm) - scalar_repl_aggregates_ssa!(pm) # SSA variant? - gvn!(pm) - - # This InstCombine needs to be after GVN - # Otherwise it will generate load chains in GPU code... - instruction_combining!(pm) - jl_inst_simplify!(pm) - mem_cpy_opt!(pm) - sccp!(pm) - instruction_combining!(pm) - jl_inst_simplify!(pm) - jump_threading!(pm) - dead_store_elimination!(pm) - alloc_opt_tm!(pm, tm) - cfgsimplification!(pm) - loop_idiom!(pm) - loop_deletion!(pm) - jump_threading!(pm) - correlated_value_propagation!(pm) - # SLP_Vectorizer -- not for Enzyme - - LLVM.run!(pm, mod) - - aggressive_dce!(pm) - instruction_combining!(pm) - jl_inst_simplify!(pm) - # Loop Vectorize -- not for Enzyme - # InstCombine - - # GC passes - barrier_noop!(pm) - gc_invariant_verifier_tm!(pm, tm, false) - - # FIXME: Currently crashes printing - cfgsimplification!(pm) - instruction_combining!(pm) # Extra for Enzyme - jl_inst_simplify!(pm) - LLVM.run!(pm, mod) - end - - # Globalopt is separated as it can delete functions, which invalidates the Julia hardcoded pointers to - # known functions - ModulePassManager() do pm - add_library_info!(pm, triple(mod)) - add_transform_info!(pm, tm) - - scoped_no_alias_aa!(pm) - type_based_alias_analysis!(pm) - basic_alias_analysis!(pm) - cpu_features_tm!(pm, tm) - - LLVM.API.LLVMAddGlobalOptimizerPass(pm) # Exxtra - gvn!(pm) # Exxtra - LLVM.run!(pm, mod) - end - removeDeadArgs!(mod, tm) - detect_writeonly!(mod) - nodecayed_phis!(mod) -end + add!(fpm, ReassociatePass()) + add!(fpm, EarlyCSEPass()) -# https://github.com/JuliaLang/julia/blob/2eb5da0e25756c33d1845348836a0a92984861ac/src/aotcompile.cpp#L603 -function addTargetPasses!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine, trip::String) - add_library_info!(pm, trip) - add_transform_info!(pm, tm) -end + # Load forwarding above can expose allocations that aren't actually used + # remove those before optimizing loops. + add!(fpm, AllocOptPass()) -# https://github.com/JuliaLang/julia/blob/2eb5da0e25756c33d1845348836a0a92984861ac/src/aotcompile.cpp#L620 -function addOptimizationPasses!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - add!(pm, FunctionPass("ReinsertGCMarker", reinsert_gcmarker_pass!)) - - constant_merge!(pm) - - propagate_julia_addrsp_tm!(pm, tm) - scoped_no_alias_aa!(pm) - type_based_alias_analysis!(pm) - basic_alias_analysis!(pm) - cfgsimplification!(pm) - dce!(pm) - scalar_repl_aggregates!(pm) - - # mem_cpy_opt!(pm) - - always_inliner!(pm) # Respect always_inline - - # Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard time - # merging the `alloca` for the unboxed data and the `alloca` created by the `alloc_opt` - # pass. - - alloc_opt_tm!(pm, tm) - # consider AggressiveInstCombinePass at optlevel > 2 - - instruction_combining!(pm) - jl_inst_simplify!(pm) - cfgsimplification!(pm) - scalar_repl_aggregates!(pm) - instruction_combining!(pm) # TODO: createInstSimplifyLegacy - jl_inst_simplify!(pm) - jump_threading!(pm) - correlated_value_propagation!(pm) - - reassociate!(pm) - - early_cse!(pm) - - # Load forwarding above can expose allocations that aren't actually used - # remove those before optimizing loops. - alloc_opt_tm!(pm, tm) - - more_loop_optimizations_tm!(pm, tm) - - # Run our own SROA on heap objects before LLVM's - alloc_opt_tm!(pm, tm) - # Re-run SROA after loop-unrolling (useful for small loops that operate, - # over the structure of an aggregate) - scalar_repl_aggregates!(pm) - instruction_combining!(pm) # TODO: createInstSimplifyLegacy - jl_inst_simplify!(pm) - - gvn!(pm) - mem_cpy_opt!(pm) - sccp!(pm) - - # Run instcombine after redundancy elimination to exploit opportunities - # opened up by them. - # This needs to be InstCombine instead of InstSimplify to allow - # loops over Union-typed arrays to vectorize. - instruction_combining!(pm) - jl_inst_simplify!(pm) - jump_threading!(pm) - dead_store_elimination!(pm) - add!(pm, FunctionPass("SafeAtomicToRegularStore", safe_atomic_to_regular_store!)) - - # More dead allocation (store) deletion before loop optimization - # consider removing this: - alloc_opt_tm!(pm, tm) - - # see if all of the constant folding has exposed more loops - # to simplification and deletion - # this helps significantly with cleaning up iteration - cfgsimplification!(pm) - loop_deletion!(pm) - instruction_combining!(pm) - jl_inst_simplify!(pm) - loop_vectorize!(pm) - # TODO: createLoopLoadEliminationPass - cfgsimplification!(pm) - slpvectorize!(pm) - # might need this after LLVM 11: - # TODO: createVectorCombinePass() - - aggressive_dce!(pm) -end + add!(fpm, NewPMLoopPassManager(use_memory_ssa=true)) do lpm + add!(lpm, LoopRotatePass()) + # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) + add!(lpm, LoopIdiomRecognizePass()) -function addMachinePasses!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine) - combine_mul_add_tm!(pm, tm) - # TODO: createDivRemPairs[] + # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards + add!(lpm, LowerSIMDLoopPass()) # Annotate loop marked with "loopinfo" as LLVM parallel loop + add!(lpm, LICMPass()) + add!(lpm, JuliaLICMPass()) + end + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, NewPMLoopPassManager()) do lpm + add!(lpm, IndVarSimplifyPass()) + add!(lpm, LoopDeletionPass()) + end + add!(fpm, LoopUnrollPass(opt_level=2)) + + # Run our own SROA on heap objects before LLVM's + add!(fpm, AllocOptPass()) + # Re-run SROA after loop-unrolling (useful for small loops that operate, + # over the structure of an aggregate) + add!(fpm, SROAPass()) + add!(fpm, InstSimplifyPass()) + + add!(fpm, GVNPass()) + add!(fpm, MemCpyOptPass()) + add!(fpm, SCCPPass()) + + # Run instcombine after redundancy elimination to exploit opportunities + # opened up by them. + # This needs to be InstCombine instead of InstSimplify to allow + # loops over Union-typed arrays to vectorize. + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, JumpThreadingPass()) + add!(fpm, DSEPass()) + add!(fpm, SafeAtomicToRegularStorePass()) + + # More dead allocation (store) deletion before loop optimization + # consider removing this: + add!(fpm, AllocOptPass()) + + # see if all of the constant folding has exposed more loops + # to simplification and deletion + # this helps significantly with cleaning up iteration + add!(fpm, SimplifyCFGPass()) + add!(fpm, NewPMLoopPassManager()) do lpm + add!(lpm, LoopDeletionPass()) + end + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, LoopVectorizePass()) + add!(fpm, SimplifyCFGPass()) + add!(fpm, SLPVectorizerPass()) + add!(fpm, ADCEPass()) + end +end - demote_float16_tm!(pm, tm) - gvn!(pm) +function addMachinePasses!(mpm::LLVM.NewPMPassManager) + add!(mpm, NewPMFunctionPassManager()) do fpm + if VERSION < v"1.12.0-DEV.1390" + add!(fpm, CombineMulAddPass()) + end + add!(fpm, DivRemPairsPass()) + add!(fpm, DemoteFloat16Pass()) + add!(fpm, GVNPass()) + end end -function addJuliaLegalizationPasses!(pm::LLVM.ModulePassManager, tm::LLVM.TargetMachine, lower_intrinsics::Bool = true) +function addJuliaLegalizationPasses!(mpm::LLVM.NewPMPassManager, lower_intrinsics::Bool = true) if lower_intrinsics - # LowerPTLS removes an indirect call. As a result, it is likely to trigger - # LLVM's devirtualization heuristics, which would result in the entire - # pass pipeline being re-exectuted. Prevent this by inserting a barrier. - barrier_noop!(pm) - add!(pm, FunctionPass("ReinsertGCMarker", reinsert_gcmarker_pass!)) - lower_exc_handlers_tm!(pm, tm) - # BUDE.jl demonstrates a bug here TODO - gc_invariant_verifier_tm!(pm, tm, false) - verifier!(pm) - - # Needed **before** LateLowerGCFrame on LLVM < 12 - # due to bug in `CreateAlignmentAssumption`. - remove_ni_tm!(pm, tm) - late_lower_gc_frame_tm!(pm, tm) - final_lower_gc_tm!(pm, tm) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, ReinsertGCMarkerPass()) + if VERSION < v"1.13.0-DEV.36" + add!(fpm, LowerExcHandlersPass()) + end + # TODO: strong=false? + add!(fpm, GCInvariantVerifierPass()) + end + add!(mpm, VerifierPass()) + add!(mpm, RemoveNIPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, LateLowerGCPass()) + if VERSION >= v"1.11.0-DEV.208" + add!(fpm, FinalLowerGCPass()) + end + end + if VERSION < v"1.11.0-DEV.208" + add!(mpm, FinalLowerGCPass()) + end # We need these two passes and the instcombine below # after GC lowering to let LLVM do some constant propagation on the tags. - # and remove some unnecessary write barrier checks. - gvn!(pm) - sccp!(pm) - # Remove dead use of ptls - dce!(pm) - lower_ptls_tm!(pm, tm, false) #=dump_native=# - instruction_combining!(pm) - jl_inst_simplify!(pm) + # and remove some unnecessary write barrier checks. + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, GVNPass()) + add!(fpm, SCCPPass()) + # Remove dead use of ptls + add!(fpm, DCEPass()) + end + add!(mpm, LowerPTLSPass()) # Clean up write barrier and ptls lowering - cfgsimplification!(pm) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + aggressiveSimplifyCFGOptions = + (forward_switch_cond=true, + switch_range_to_icmp=true, + switch_to_lookup=true, + hoist_common_insts=true) + add!(fpm, SimplifyCFGPass(; aggressiveSimplifyCFGOptions...)) + end else - barrier_noop!(pm) - remove_ni_tm!(pm, tm) + add!(mpm, RemoveNIPass()) end end @@ -646,19 +373,25 @@ function post_optimize!(mod::LLVM.Module, tm::LLVM.TargetMachine, machine::Bool ), ) end - LLVM.ModulePassManager() do pm - addTargetPasses!(pm, tm, LLVM.triple(mod)) - addOptimizationPasses!(pm, tm) - LLVM.run!(pm, mod) - end - if machine - # TODO enable validate_return_roots - # validate_return_roots!(mod) - LLVM.ModulePassManager() do pm - addJuliaLegalizationPasses!(pm, tm, true) - addMachinePasses!(pm, tm) - LLVM.run!(pm, mod) + @dispose pb = NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) + register!(pb, ReinsertGCMarkerPass()) + register!(pb, SafeAtomicToRegularStorePass()) + add!(pb, NewPMAAManager()) do aam + add!(aam, ScopedNoAliasAA()) + add!(aam, TypeBasedAA()) + add!(aam, BasicAA()) + end + add!(pb, NewPMModulePassManager()) do mpm + addOptimizationPasses!(mpm) + if machine + # TODO enable validate_return_roots + # validate_return_roots!(mod) + addJuliaLegalizationPasses!(mpm, true) + addMachinePasses!(mpm) + end end + run!(pb, mod, tm) end for f in functions(mod) if isempty(blocks(f)) diff --git a/src/llvm/transforms.jl b/src/llvm/transforms.jl index 7e141d2f02..5462fd43ca 100644 --- a/src/llvm/transforms.jl +++ b/src/llvm/transforms.jl @@ -299,6 +299,7 @@ function addr13NoAlias(mod::LLVM.Module) end end end + return true end ## given code like @@ -2369,31 +2370,33 @@ function checkNoAssumeFalse(mod::LLVM.Module, shouldshow::Bool = false) end function rewrite_generic_memory!(mod::LLVM.Module) -@static if VERSION < v"1.11-" -else - for f in functions(mod), bb in blocks(f) - iter = LLVM.API.LLVMGetFirstInstruction(bb) - while iter != C_NULL - inst = LLVM.Instruction(iter) - iter = LLVM.API.LLVMGetNextInstruction(iter) - if !isa(inst, LLVM.LoadInst) - continue - end - - if isa(operands(inst)[1], LLVM.ConstantExpr) + @static if VERSION < v"1.11-" + return false + else + for f in functions(mod), bb in blocks(f) + iter = LLVM.API.LLVMGetFirstInstruction(bb) + while iter != C_NULL + inst = LLVM.Instruction(iter) + iter = LLVM.API.LLVMGetNextInstruction(iter) + if !isa(inst, LLVM.LoadInst) + continue + end + + if isa(operands(inst)[1], LLVM.ConstantExpr) legal2, obj = absint(inst) if legal2 && obj isa Memory && obj == typeof(obj).instance - b = LLVM.IRBuilder() - position!(b, inst) - replace_uses!(inst, unsafe_to_llvm(b, obj)) - LLVM.API.LLVMInstructionEraseFromParent(inst) - continue - end - end - end + b = LLVM.IRBuilder() + position!(b, inst) + replace_uses!(inst, unsafe_to_llvm(b, obj)) + LLVM.API.LLVMInstructionEraseFromParent(inst) + continue + end + end + end + end + return true end end -end function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) # We need to run globalopt first. This is because remove dead args will otherwise @@ -2558,37 +2561,47 @@ function removeDeadArgs!(mod::LLVM.Module, tm::LLVM.TargetMachine) end end propagate_returned!(mod) - ModulePassManager() do pm - instruction_combining!(pm) - jl_inst_simplify!(pm) - alloc_opt_tm!(pm, tm) - scalar_repl_aggregates_ssa!(pm) # SSA variant? - cse!(pm) - LLVM.run!(pm, mod) + LLVM.@dispose pb = NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) + add!(pb, NewPMModulePassManager()) do mpm + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, AllocOptPass()) + add!(fpm, SROAPass()) + add!(fpm, EarlyCSEPass()) + end + end + LLVM.run!(pb, mod) end propagate_returned!(mod) pre_attr!(mod, RunAttributor[]) if RunAttributor[] - if LLVM.version().major >= 13 - ModulePassManager() do pm - API.EnzymeAddAttributorLegacyPass(pm) - LLVM.run!(pm, mod) + LLVM.@dispose pb = NewPMPassBuilder() begin + register!(pb, EnzymeAttributorPass()) + add!(pb, NewPMModulePassManager()) do mpm + add!(mpm, EnzymeAttributorPass()) end + LLVM.run!(pb, mod) end end propagate_returned!(mod) - ModulePassManager() do pm - instruction_combining!(pm) - jl_inst_simplify!(pm) - alloc_opt_tm!(pm, tm) - scalar_repl_aggregates_ssa!(pm) # SSA variant? - if RunAttributor[] - if LLVM.version().major >= 13 - API.EnzymeAddAttributorLegacyPass(pm) + LLVM.@dispose pb = NewPMPassBuilder() begin + registerEnzymeAndPassPipeline!(pb) + register!(pb, EnzymeAttributorPass()) + add!(pb, NewPMModulePassManager()) do mpm + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, InstCombinePass()) + add!(fpm, JLInstSimplifyPass()) + add!(fpm, AllocOptPass()) + add!(fpm, SROAPass()) + end + add!(mpm, EnzymeAttributorPass()) + add!(mpm, NewPMFunctionPassManager()) do fpm + add!(fpm, EarlyCSEPass()) end end - cse!(pm) - LLVM.run!(pm, mod) + LLVM.run!(pb, mod) end post_attr!(mod, RunAttributor[]) propagate_returned!(mod)