QuantumBFS
diff --git a/‎.github/workflows/CI.yml‎
Lines changed: 1 addition & 2 deletions b/‎.github/workflows/CI.yml‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎Project.toml‎
Lines changed: 12 additions & 8 deletions b/‎Project.toml‎
Lines changed: 12 additions & 8 deletions
diff --git a/‎src/YaoCompiler.jl‎
Lines changed: 58 additions & 104 deletions b/‎src/YaoCompiler.jl‎
Lines changed: 58 additions & 104 deletions
diff --git a/‎src/codegen/llvmopt.jl‎
Lines changed: 161 additions & 0 deletions b/‎src/codegen/llvmopt.jl‎
Lines changed: 161 additions & 0 deletions
@@ -10,8 +10,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6.0-beta1'
-          - 'nightly'
+          - '1.6'
         os:
           - ubuntu-latest
         arch:
 
@@ -1,24 +1,28 @@
 name = "YaoCompiler"
 uuid = "b3514fdc-7bcc-425b-920a-42c43c8eb4b4"
-authors = ["Roger-luo"]
+authors = ["Roger-Luo <rogerluo.rl18@gmail.com> and contributors"]
 version = "0.1.0"
 
 [deps]
-BitBasis = "50ba71b6-fa0f-514d-ae9a-0916efc90dcf"
-ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
+CompilerPluginTools = "6b7a57c9-7cc1-4fdf-b7f5-e857abae3638"
+Configurations = "5218b696-f38b-4ac9-8b61-a12ec717816d"
+Expronicon = "6b7a57c9-7cc1-4fdf-b7f5-e857abae3636"
+GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
+LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-LuxurySparse = "d05aeea4-b7d4-55ac-b691-9e7fabb07ba2"
 MLStyle = "d8e11817-5142-5d16-987a-aa16d5891078"
-OpenQASM = "a8821629-a4c0-4df7-9e00-12969ff383a7"
-RBNF = "83ef0002-5b9e-11e9-219b-65bac3c6d69c"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 YaoAPI = "0843a435-28de-4971-9e8b-a9641b2983a8"
 YaoLocations = "66df03fb-d475-48f7-b449-3d9064bf085b"
 ZXCalculus = "3525faa3-032d-4235-a8d4-8c2939a218dd"
 
 [compat]
-ExprTools = "0.1"
-LuxurySparse = "0.6"
+CompilerPluginTools = "0.1"
+Configurations = "0.15"
+Expronicon = "0.6"
+GPUCompiler = "0.11"
+LLVM = "3.6"
+MLStyle = "0.4"
 TimerOutputs = "0.5"
 YaoAPI = "0.1"
 YaoLocations = "0.1"
 
@@ -1,123 +1,77 @@
 module YaoCompiler
 
-export @device, @gate, @ctrl, @measure, @barrier
-# reflections
-export @code_yao, @code_qasm
-export gate_count
-export Intrinsics
+export @device, @gate, @ctrl, @measure, @barrier,
+    compile,
+    YaoInterpreter,
+    YaoCompileTarget,
+    JLGenericTarget,
+    TargetHostKernel,
+    HardwareFreeOptions,
+    Routine,
+    GenericRoutine,
+    IntrinsicRoutine,
+    Operation,
+    AdjointOperation,
+    routine_name,
+    IntrinsicError,
+    # reexport YaoLocations
+    Locations,
+    CtrlLocations
 
+using MLStyle
+using YaoAPI
+using LLVM
+using Expronicon
+using YaoLocations
+using TimerOutputs
 using LinearAlgebra
+using GPUCompiler
+using Configurations
+using CompilerPluginTools
+using LLVM.Interop
+using GPUCompiler: CodeCache, CompilerJob, AbstractCompilerTarget, AbstractCompilerParams, WorldView
+using YaoLocations: map_check, map_check_nothrow, map_error, plain, unsafe_mapping
+using CompilerPluginTools: Argument
+using Base.Meta: ParseError
 
-using TimerOutputs
 const to = TimerOutput()
 timings() = (TimerOutputs.print_timer(to); println())
 enable_timings() = (TimerOutputs.enable_debug_timings(Compiler); return)
 
-using ExprTools
-using MLStyle
-using YaoAPI
-using BitBasis
-using ZXCalculus
-using YaoLocations
-using YaoLocations: map_check, map_check_nothrow, map_error, plain
-# include("runtime/locations.jl")
-
-using Core:
-    CodeInfo,
-    SSAValue,
-    Const,
-    PartialStruct,
-    Slot,
-    GotoIfNot,
-    GotoNode,
-    SlotNumber,
-    Argument,
-    ReturnNode
-using Core.Compiler:
-    InferenceParams,
-    InferenceResult,
-    OptimizationParams,
-    OptimizationState,
-    Bottom,
-    AbstractInterpreter,
-    VarTable,
-    InferenceState,
-    CFG,
-    NewSSAValue,
-    IRCode,
-    InstructionStream,
-    CallMeta
-using Core.Compiler:
-    get_world_counter,
-    get_inference_cache,
-    may_optimize,
-    isconstType,
-    isconcretetype,
-    widenconst,
-    isdispatchtuple,
-    isinlineable,
-    is_inlineable_constant,
-    copy_exprargs,
-    convert_to_ircode,
-    coverage_enabled,
-    argtypes_to_type,
-    userefs,
-    UseRefIterator,
-    UseRef,
-    MethodResultPure,
-    is_pure_intrinsic_infer,
-    intrinsic_nothrow,
-    quoted,
-    anymap,
-    # Julia passes
-    compact!,
-    ssa_inlining_pass!,
-    getfield_elim_pass!,
-    adce_pass!,
-    type_lift_pass!,
-    verify_linetable,
-    verify_ir,
-    slot2reg
-
-using Base.Meta: ParseError
-
-export Routine,
-    GenericRoutine,
-    IntrinsicRoutine,
-    RoutineSpec,
-    IntrinsicSpec,
-    @ctrl,
-    @measure,
-    @gate,
-    @barrier,
-    @device
-export routine_name
+@as_record Locations
+@as_record CtrlLocations
 
-include("compiler/patch.jl")
-include("compiler/routine.jl")
+include("compiler/types.jl")
+include("compiler/printing.jl")
 include("compiler/intrinsics.jl")
-include("compiler/qasm.jl")
+include("compiler/syntax.jl")
+include("compiler/interp.jl")
 
-using .QASM: @qasm_str
-export @qasm_str
+include("codegen/llvmopt.jl")
+include("codegen/native.jl")
 
-# compiler internal extensions
-include("compiler/interpreter.jl")
-include("compiler/codeinfo.jl")
-include("compiler/optimize.jl")
-
-# code generators
-include("compiler/codegen/codegen.jl")
-
-include("compiler/reflection.jl")
-include("compiler/utils.jl")
-# include("compiler/validation.jl")
-# include("compiler/trace.jl")
+# We have one global JIT and TM
+const orc = Ref{LLVM.OrcJIT}()
+const tm = Ref{LLVM.TargetMachine}()
 
+# Module initializer: resets the shared timer `to`, then builds the global
+# target machine (`tm`) and ORC JIT (`orc`) and registers their cleanup.
 function __init__()
     TimerOutputs.reset_timer!(to)
-end
+    # Translate the `opt_level` field of `Base.JLOptions()` into an LLVM codegen
+    # level: below 2 -> None, exactly 2 -> Default, anything higher -> Aggressive.
+    opt_level = Base.JLOptions().opt_level
+    if opt_level < 2
+        optlevel = LLVM.API.LLVMCodeGenLevelNone
+    elseif opt_level == 2
+        optlevel = LLVM.API.LLVMCodeGenLevelDefault
+    else
+        optlevel = LLVM.API.LLVMCodeGenLevelAggressive
+    end
 
-include("runtime/intrinsics.jl")
+    # Store the target machine in the global `tm` Ref and turn on verbose asm.
+    tm[] = LLVM.JITTargetMachine(; optlevel=optlevel)
+    LLVM.asm_verbosity!(tm[], true)
+
+    orc[] = LLVM.OrcJIT(tm[]) # takes ownership of tm
+    # Dispose the JIT from an exit hook.
+    atexit() do
+        return LLVM.dispose(orc[])
+    end
+end
 
 end # module
@@ -0,0 +1,161 @@
+#####
+##### LLVM optimization pipeline
+#####
+
+# https://github.com/JuliaLang/julia/blob/2eb5da0e25756c33d1845348836a0a92984861ac/src/aotcompile.cpp#L603
+# Register the target-specific passes on `pm`: library info for the triple of
+# `tm`, then transform info for `tm`. Returns the result of `add_transform_info!`.
+function addTargetPasses!(pm, tm)
+    target_triple = LLVM.triple(tm)
+    add_library_info!(pm, target_triple)
+    return add_transform_info!(pm, tm)
+end
+
+# TODO (Missing C-API):
+#  - https://reviews.llvm.org/D86764 adds InstSimplify
+#  - createDivRemPairs
+#  - createLoopLoadEliminationPass
+#  - createVectorCombinePass
+# TODO (Missing LLVM.jl)
+#  - AggressiveInstCombinePass
+
+# https://github.com/JuliaLang/julia/blob/2eb5da0e25756c33d1845348836a0a92984861ac/src/aotcompile.cpp#L620
+# Populate the pass manager `pm` with the optimization pipeline.
+#
+# Arguments:
+#  - `pm`: pass manager that receives the passes (mutated).
+#  - `tm`: target machine; not used by the body.
+#  - `opt_level`: must be >= 2 (an error is raised otherwise); at >= 3 the basic
+#    alias analysis pass is added as well.
+#  - `lower_intrinsics`: when true, the exception-handler / GC / PTLS lowering
+#    passes are appended; when false only `remove_ni!` is.
+#  - `dump_native`: must be false (an error is raised otherwise); it is also
+#    forwarded to `lower_ptls!`.
+#
+# Returns the result of `combine_mul_add!(pm)`.
+function addOptimizationPasses!(pm, tm, opt_level, lower_intrinsics, dump_native)
+    # Reject unsupported configurations before touching `pm`, so a failed call
+    # does not leave a partially populated pass manager behind.
+    opt_level < 2 && error("opt_level less than 2 not supported")
+    dump_native && error("dump_native not supported")
+
+    constant_merge!(pm)
+
+    propagate_julia_addrsp!(pm)
+    scoped_no_alias_aa!(pm)
+    type_based_alias_analysis!(pm)
+    if opt_level >= 3
+        basic_alias_analysis!(pm)
+    end
+    cfgsimplification!(pm)
+    dce!(pm)
+    scalar_repl_aggregates!(pm)
+
+    # mem_cpy_opt!(pm)
+
+    always_inliner!(pm) # Respect always_inline
+
+    # Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard time
+    # merging the `alloca` for the unboxed data and the `alloca` created by the `alloc_opt`
+    # pass.
+
+    alloc_opt!(pm)
+    # consider AggressiveInstCombinePass at optlevel > 2
+
+    instruction_combining!(pm)
+    cfgsimplification!(pm)
+    # TODO: createMultiversoningPass (belongs here once `dump_native` is supported)
+    scalar_repl_aggregates!(pm)
+    instruction_combining!(pm) # TODO: createInstSimplifyLegacy
+    jump_threading!(pm)
+
+    reassociate!(pm)
+
+    early_cse!(pm)
+
+    # Load forwarding above can expose allocations that aren't actually used
+    # remove those before optimizing loops.
+    alloc_opt!(pm)
+    loop_rotate!(pm)
+    # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1)
+    loop_idiom!(pm)
+
+    # TODO: Polly (Quo vadis?)
+
+    # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards
+    lower_simdloop!(pm) # Annotate loop marked with "loopinfo" as LLVM parallel loop
+    licm!(pm)
+    julia_licm!(pm)
+    # Subsequent passes not stripping metadata from terminator
+    instruction_combining!(pm) # TODO: createInstSimplifyLegacy
+    ind_var_simplify!(pm)
+    loop_deletion!(pm)
+    loop_unroll!(pm) # TODO: in Julia createSimpleLoopUnroll
+
+    # Run our own SROA on heap objects before LLVM's
+    alloc_opt!(pm)
+    # Re-run SROA after loop-unrolling (useful for small loops that operate,
+    # over the structure of an aggregate)
+    scalar_repl_aggregates!(pm)
+    instruction_combining!(pm) # TODO: createInstSimplifyLegacy
+
+    gvn!(pm)
+    mem_cpy_opt!(pm)
+    sccp!(pm)
+
+    # Run instcombine after redundancy elimination to exploit opportunities
+    # opened up by them.
+    # This needs to be InstCombine instead of InstSimplify to allow
+    # loops over Union-typed arrays to vectorize.
+    instruction_combining!(pm)
+    jump_threading!(pm)
+    dead_store_elimination!(pm)
+
+    # More dead allocation (store) deletion before loop optimization
+    # consider removing this:
+    alloc_opt!(pm)
+
+    # see if all of the constant folding has exposed more loops
+    # to simplification and deletion
+    # this helps significantly with cleaning up iteration
+    cfgsimplification!(pm)
+    loop_deletion!(pm)
+    instruction_combining!(pm)
+    loop_vectorize!(pm)
+    # TODO: createLoopLoadEliminationPass
+    cfgsimplification!(pm)
+    slpvectorize!(pm)
+    # might need this after LLVM 11:
+    # TODO: createVectorCombinePass()
+
+    aggressive_dce!(pm)
+
+    if lower_intrinsics
+        # LowerPTLS removes an indirect call. As a result, it is likely to trigger
+        # LLVM's devirtualization heuristics, which would result in the entire
+        # pass pipeline being re-executed. Prevent this by inserting a barrier.
+        barrier_noop!(pm)
+        lower_exc_handlers!(pm)
+        gc_invariant_verifier!(pm, false)
+        # Needed **before** LateLowerGCFrame on LLVM < 12
+        # due to bug in `CreateAlignmentAssumption`.
+        remove_ni!(pm)
+        late_lower_gc_frame!(pm)
+        final_lower_gc!(pm)
+        # We need these two passes and the instcombine below
+        # after GC lowering to let LLVM do some constant propagation on the tags.
+        # and remove some unnecessary write barrier checks.
+        gvn!(pm)
+        sccp!(pm)
+        # Remove dead use of ptls
+        dce!(pm)
+        lower_ptls!(pm, dump_native)
+        instruction_combining!(pm)
+        # Clean up write barrier and ptls lowering
+        cfgsimplification!(pm)
+    else
+        remove_ni!(pm)
+    end
+    # TODO: createDivRemPairs
+    return combine_mul_add!(pm)
+end
+
+# Append the machine-level passes to `pm`: float16 demotion, then GVN.
+# `tm` is not used by the body. Returns the result of `gvn!(pm)`.
+function addMachinePasses!(pm, tm)
+    demote_float16!(pm)
+    gvn_result = gvn!(pm)
+    return gvn_result
+end
+
+# Run the whole pipeline over `mod` with the global target machine `tm[]`:
+# target passes, optimization passes (opt_level 3, intrinsic lowering enabled,
+# dump_native disabled), then machine passes. Returns the result of `run!`.
+function run_pipeline!(mod::LLVM.Module)
+    return LLVM.ModulePassManager() do pass_manager
+        machine = tm[]
+        addTargetPasses!(pass_manager, machine)
+        addOptimizationPasses!(pass_manager, machine, 3, true, false)
+        addMachinePasses!(pass_manager, machine)
+        run!(pass_manager, mod)
+    end
+end