diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 165bbe3..11f6028 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,7 +35,6 @@ jobs: env: CUDA_VISIBLE_DEVICES: 1 JULIA_DEPOT_PATH: /scratch/github-actions/julia_depot_exaadmmm - JULIA_CUDA_USE_BINARYBUILDER: true runs-on: self-hosted strategy: matrix: diff --git a/.gitignore b/.gitignore index 90ed0f8..8789fd2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ Manifest.toml docs/build docs/Manifest.toml +.vscode/settings.json +Manifest.toml diff --git a/Artifacts.toml b/Artifacts.toml new file mode 100644 index 0000000..c12cd5c --- /dev/null +++ b/Artifacts.toml @@ -0,0 +1,6 @@ +[ExaData] +git-tree-sha1 = "df547b1e5d707fdb88721eee9bf84f9211e1fb50" +lazy = true + [[ExaData.download]] + url = "https://web.cels.anl.gov/~mschanen/ExaData-e1b6123.tar.gz" + sha256 = "f6e22f26fbf273e97625a25b1b55f07d0eb0e15283483c545dda6eda84c48fdd" diff --git a/Project.toml b/Project.toml index fe0338d..4e10004 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "ExaAdmm" uuid = "4d6a948c-1075-4240-a564-361a5d4e22a2" -authors = ["Youngdae Kim ", "Kibaek Kim ", "Weiqi Zhang ", "François Pacaud ", "Michel Schanen "] -version = "0.4.0" +authors = ["Youngdae Kim ", "Kibaek Kim ", "Weiqi Zhang ", "Bowen Li ", "François Pacaud ", "Michel Schanen "] +version = "0.4.1" [deps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" @@ -19,7 +19,7 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] AMDGPU = "0.4" -CUDA = "4" +CUDA = "4.1" ExaTron = "3" FileIO = "1.14" KernelAbstractions = "0.9" diff --git a/src/ExaAdmm.jl b/src/ExaAdmm.jl index 535caec..5897a09 100644 --- a/src/ExaAdmm.jl +++ b/src/ExaAdmm.jl @@ -27,6 +27,7 @@ include("utils/utilities_gpu.jl") include("utils/utilities_ka.jl") include("algorithms/admm_two_level.jl") +include("algorithms/admm_one_level.jl") # ---------------------------------------- # A single period ACOPF implementation @@ -158,4 +159,37 @@ include("models/mpec/mpec_admm_update_residual_gpu.jl") include("models/mpec/mpec_admm_update_lz_gpu.jl") include("models/mpec/mpec_admm_prepoststep_gpu.jl") =# + +# Interface to use ADMM solving QPsub. +include("interface/solve_qpsub.jl") + +# Define "struct ModelAcopf" for encapsulating an ACOPF model. 
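# The QPsub files included below specialize the one-level ADMM update steps for
# ModelQpsub. A schematic sketch of the calls made by admm_one_level(env, mod)
# on each inner iteration (signatures abbreviated; see the individual files):
#
#   admm_update_x(env, mod)         # branch/generator subproblems (ExaTron ALM)
#   admm_update_xbar(env, mod)      # bus consensus update
#   admm_update_l_single(env, mod)  # multiplier update l <- l + rho .* (u - v)
#   admm_update_residual(env, mod)  # primal/dual residuals, objective, auglag
#
# CPU and GPU methods are selected by the array types (Array vs CuArray) carried
# in AdmmEnv and ModelQpsub. The QPsub model struct itself is defined next: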
+include("models/qpsub/qpsub_model.jl") + +# CPU +include("models/qpsub/qpsub_init_solution_cpu.jl") +include("models/qpsub/qpsub_generator_kernel_cpu.jl") +include("models/qpsub/qpsub_admm_update_x_cpu.jl") +include("models/qpsub/qpsub_admm_update_xbar_cpu.jl") +include("models/qpsub/qpsub_admm_update_l_single_cpu.jl") +include("models/qpsub/qpsub_admm_update_residual_cpu.jl") +include("models/qpsub/qpsub_admm_prepoststep_cpu.jl") +include("models/qpsub/qpsub_eval_Ab_linelimit_kernel_cpu.jl") +include("models/qpsub/qpsub_auglag_Ab_linelimit_kernel_red_cpu.jl") +include("models/qpsub/qpsub_auglag_tron_linelimit_kernel_cpu.jl") + +# GPU +include("models/qpsub/qpsub_init_solution_gpu.jl") +include("models/qpsub/qpsub_generator_kernel_gpu.jl") +include("models/qpsub/qpsub_admm_update_x_gpu.jl") +include("models/qpsub/qpsub_admm_update_xbar_gpu.jl") +include("models/qpsub/qpsub_admm_update_l_single_gpu.jl") +include("models/qpsub/qpsub_admm_update_residual_gpu.jl") +include("models/qpsub/qpsub_eval_Ab_linelimit_kernel_gpu.jl") +include("models/qpsub/qpsub_auglag_Ab_linelimit_kernel_red_gpu.jl") +include("models/qpsub/qpsub_tron_linelimit_kernel.jl") +include("models/qpsub/qpsub_admm_prepoststep_gpu.jl") + + + end # module diff --git a/src/algorithms/admm_one_level.jl b/src/algorithms/admm_one_level.jl new file mode 100644 index 0000000..97cf6d1 --- /dev/null +++ b/src/algorithms/admm_one_level.jl @@ -0,0 +1,80 @@ +function admm_one_level( + env::AdmmEnv, mod::AbstractOPFModel +) + par = env.params + info = mod.info + sol = mod.solution + + sqrt_d = sqrt(mod.nvar) + OUTER_TOL = sqrt_d*(par.outer_eps) #adjusted outer loop tolerance + + fill!(info, 0) + info.mismatch = Inf + + #eliminate second level + info.norm_z_prev = info.norm_z_curr = 0 + par.initial_beta = 0 + par.beta = 0 + sol.lz .= 0 + sol.z_curr .= 0 + sol.z_prev .= 0 + par.inner_iterlim = 1 + + if par.verbose > 0 + admm_update_residual(env, mod) + @printf("%8s %10s %10s %10s %10s %10s %10s\n", + "Iter", "Objval", "Auglag", "PrimRes", "PrimTol", "DualRes", "DualTol") + + @printf("%8d %10.3e %10.3e %10.3e %10.3e %10.3e %10.3e\n", + info.outer, info.objval, info.auglag, info.mismatch, OUTER_TOL, info.dualres, OUTER_TOL*norm(sol.rho)/sqrt_d) + end + + info.status = :IterationLimit + + overall_time = @timed begin + while info.outer < par.outer_iterlim + admm_increment_outer(env, mod) + + + admm_increment_reset_inner(env, mod) + while info.inner < par.inner_iterlim + admm_increment_inner(env, mod) + + admm_update_x(env, mod) + + admm_update_xbar(env, mod) + + admm_update_l_single(env, mod) + + admm_update_residual(env, mod) + + if par.verbose > 0 + if (info.cumul % 50) == 0 + @printf("%8s %10s %10s %10s %10s %10s %10s\n", + "Iter", "Objval", "Auglag", "PrimRes", "PrimTol", "DualRes", "DualTol") + end + + @printf("%8d %10.3e %10.3e %10.3e %10.3e %10.3e %10.3e\n", + info.outer, info.objval, info.auglag, info.mismatch, OUTER_TOL, info.dualres, OUTER_TOL*norm(sol.rho)/sqrt_d) + end + + end # while inner + + # mismatch: x-xbar + if info.mismatch <= OUTER_TOL && info.dualres <= OUTER_TOL*norm(sol.rho)/sqrt_d + info.status = :Solved + break + end + + end # while outer + end # @timed + + info.time_overall = overall_time.time + admm_poststep(env, mod) + + if par.verbose > 0 + print_statistics(env, mod) + end + + return +end diff --git a/src/algorithms/admm_two_level.jl b/src/algorithms/admm_two_level.jl index 2f6d26f..559ffbc 100644 --- a/src/algorithms/admm_two_level.jl +++ b/src/algorithms/admm_two_level.jl @@ -5,7 +5,7 @@ function admm_two_level( 
info = mod.info sqrt_d = sqrt(mod.nvar) - OUTER_TOL = sqrt_d*(par.outer_eps) + OUTER_TOL = sqrt_d*(par.outer_eps) #adjusted outer loop tolerance fill!(info, 0) info.mismatch = Inf @@ -41,6 +41,7 @@ function admm_two_level( admm_update_l(env, mod, device) admm_update_residual(env, mod, device) + # an adjusting termination criteria for inner loop (i.e., inner loop is not solved to exact) info.eps_pri = sqrt_d / (2500*info.outer) if par.verbose > 0 @@ -55,11 +56,13 @@ function admm_two_level( info.dualres, info.norm_z_curr, info.mismatch, OUTER_TOL, par.beta) end + # primres: x-xbar+z_curr if info.primres <= info.eps_pri #|| info.dualres <= par.DUAL_TOL break end end # while inner + # mismatch: x-xbar if info.mismatch <= OUTER_TOL info.status = :Solved break @@ -67,6 +70,7 @@ function admm_two_level( admm_update_lz(env, mod, device) + # if z_curr too large vs z_prev, increase penalty if info.norm_z_curr > par.theta*info.norm_z_prev par.beta = min(par.inc_c*par.beta, 1e24) end @@ -81,4 +85,4 @@ function admm_two_level( end return -end \ No newline at end of file +end diff --git a/src/interface/solve_acopf.jl b/src/interface/solve_acopf.jl index d1b4e28..2ec036a 100644 --- a/src/interface/solve_acopf.jl +++ b/src/interface/solve_acopf.jl @@ -37,6 +37,8 @@ function solve_acopf(case::String; env.params.outer_eps = outer_eps env.params.outer_iterlim = outer_iterlim env.params.inner_iterlim = inner_iterlim + #memory allocation for CuDynamicSharedArray + env.params.shmem_size = sizeof(Float64)*(14*mod.n+3*mod.n^2) + sizeof(Int)*(4*mod.n) admm_two_level(env, mod, isa(ka_device, KA.CPU) ? nothing : ka_device) diff --git a/src/interface/solve_pf.jl b/src/interface/solve_pf.jl index e761590..dea6741 100644 --- a/src/interface/solve_pf.jl +++ b/src/interface/solve_pf.jl @@ -70,4 +70,4 @@ function solve_pf(pf::PowerFlow; tol=1e-6, max_iter=50) @printf("Elapsed time (secs). . . 
%8.2f\n", timed.time) return pf -end \ No newline at end of file +end diff --git a/src/interface/solve_qpsub.jl b/src/interface/solve_qpsub.jl new file mode 100644 index 0000000..f4ede73 --- /dev/null +++ b/src/interface/solve_qpsub.jl @@ -0,0 +1,109 @@ +function solve_qpsub( + case::String, + Hs, + LH_1h, + RH_1h, + LH_1i, + RH_1i, + LH_1j, + RH_1j, + LH_1k, + RH_1k, + ls, + us, + pgmax, + pgmin, + qgmax, + qgmin, + c1, + c2, + Pd, + Qd, + initial_beta; + case_format = "matpower", + outer_iterlim = 20, + inner_iterlim = 1000, + rho_pq = 400.0, + rho_va = 40000.0, + obj_scale = 1.0, + scale = 1e-4, + storage_ratio = 0.0, + storage_charge_max = 1.0, + use_gpu = false, + use_linelimit = true, + use_projection = false, + tight_factor = 1.0, + outer_eps = 2 * 1e-4, + gpu_no = 0, + verbose = 1, + onelevel = true, +) + T = Float64 + TD = Array{Float64,1} + TI = Array{Int,1} + TM = Array{Float64,2} + if use_gpu + CUDA.device!(gpu_no) + TD = CuArray{Float64,1} + TI = CuArray{Int,1} + TM = CuArray{Float64,2} + end + + env = AdmmEnv{T,TD,TI,TM}( + case, + rho_pq, + rho_va; + case_format = case_format, + use_gpu = use_gpu, + use_linelimit = use_linelimit, + use_projection = use_projection, + tight_factor = tight_factor, + gpu_no = gpu_no, + storage_ratio = storage_ratio, + storage_charge_max = storage_charge_max, + verbose = verbose, + ) + mod = ModelQpsub{T,TD,TI,TM}(env) + + data = mod.grid_data + + mod.Hs = copy(Hs) + mod.LH_1h = copy(LH_1h) + mod.RH_1h = copy(RH_1h) + mod.LH_1i = copy(LH_1i) + mod.RH_1i = copy(RH_1i) + mod.LH_1j = copy(LH_1j) + mod.RH_1j = copy(RH_1j) + mod.LH_1k = copy(LH_1k) + mod.RH_1k = copy(RH_1k) + mod.ls = copy(ls) + mod.us = copy(us) + + mod.qpsub_pgmax = copy(pgmax) + mod.qpsub_pgmin = copy(pgmin) + mod.qpsub_qgmax = copy(qgmax) + mod.qpsub_qgmin = copy(qgmin) + + mod.qpsub_c1 = copy(c1) + mod.qpsub_c2 = copy(c2) + mod.qpsub_Pd = copy(Pd) + mod.qpsub_Qd = copy(Qd) + + env.params.scale = scale + env.params.obj_scale = obj_scale + env.params.outer_eps = outer_eps + env.params.outer_iterlim = outer_iterlim + env.params.inner_iterlim = inner_iterlim + env.params.shmem_size = sizeof(Float64) * (16 * mod.n + 4 * mod.n^2 + 178) + sizeof(Int) * (4 * mod.n) + env.params.initial_beta = initial_beta #use my initial beta + + init_solution!(mod, mod.solution, env.initial_rho_pq, env.initial_rho_va) + + #one-level or two-level admm + if onelevel + admm_one_level(env, mod) + else + @warn "two-level ADMM is not implemented in QPsub" + end + return env, mod +end diff --git a/src/models/qpsub/qpsub_admm_increment.jl b/src/models/qpsub/qpsub_admm_increment.jl new file mode 100644 index 0000000..69947af --- /dev/null +++ b/src/models/qpsub/qpsub_admm_increment.jl @@ -0,0 +1,44 @@ +""" + admm_increment_outer() + +increment outer iteration +""" + +function admm_increment_outer( + env::AdmmEnv, + mod::ModelQpsub +) + mod.info.outer += 1 + return +end + + +""" + admm_increment_reset_inner() + +reset inner iteration when outer iteration just incremented +""" + +function admm_increment_reset_inner( + env::AdmmEnv, + mod::ModelQpsub +) + mod.info.inner = 0 + return +end + + +""" + admm_increment_inner() + +increment inner iteration +""" + +function admm_increment_inner( + env::AdmmEnv, + mod::ModelQpsub +) + mod.info.inner += 1 + mod.info.cumul += 1 + return +end diff --git a/src/models/qpsub/qpsub_admm_prepoststep_cpu.jl b/src/models/qpsub/qpsub_admm_prepoststep_cpu.jl new file mode 100644 index 0000000..6a0a834 --- /dev/null +++ b/src/models/qpsub/qpsub_admm_prepoststep_cpu.jl @@ -0,0 +1,83 @@ 
+""" + admm_poststep() + +- after admm termination, fix solution pf_projection() and record time mod.info.time_projection +- update info.objval and info.auglag with projected solution +- feed step solution, multipliers and KKT error info to SQOPF +""" + +function admm_poststep( + env::AdmmEnv{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}}, + mod::ModelQpsub{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}} +) + par, data, sol, info, grid_data = env.params, env.data, mod.solution, mod.info, mod.grid_data + + if env.use_projection + time_projection = @timed pf_projection(env, mod) + mod.info.time_projection = time_projection.time + end + + #final objective and auglag + info.objval = sum(mod.qpsub_c2[g]*(grid_data.baseMVA*sol.u_curr[mod.gen_start+2*(g-1)])^2 + + mod.qpsub_c1[g]*(grid_data.baseMVA*sol.u_curr[mod.gen_start+2*(g-1)]) + for g in 1:grid_data.ngen) + + sum(0.5*dot(mod.sqp_line[:,l],mod.Hs[6*(l-1)+1:6*l,1:6],mod.sqp_line[:,l]) for l=1:grid_data.nline) + + info.auglag = info.objval + sum(sol.lz[i]*sol.z_curr[i] for i=1:mod.nvar) + + 0.5*par.beta*sum(sol.z_curr[i]^2 for i=1:mod.nvar) + + sum(sol.l_curr[i]*sol.rp[i] for i=1:mod.nvar) + + 0.5*sum(sol.rho[i]*(sol.rp[i])^2 for i=1:mod.nvar) + + + #find dual infeas kkt for SQP integration + pg_dual_infeas = [2*mod.qpsub_c2[g]*(grid_data.baseMVA)^2*sol.u_curr[mod.gen_start+2*(g-1)] for g = 1:grid_data.ngen] + line_dual_infeas = vcat([mod.Hs[6*(l-1)+1:6*l,1:6] * mod.sqp_line[:,l] for l = 1:grid_data.nline]...) + mod.dual_infeas = vcat(pg_dual_infeas, line_dual_infeas) #unscale + + #assign value to step variable; due to inexact steps, other options include use u_curr, v_curr alone or average + #generation + @inbounds begin + for g = 1: grid_data.ngen + pg_idx = mod.gen_start + 2*(g-1) + qg_idx = mod.gen_start + 2*(g-1) + 1 + mod.dpg_sol[g] = sol.u_curr[pg_idx] + mod.dqg_sol[g] = sol.u_curr[qg_idx] + end + + mod.dline_var = copy(mod.sqp_line) + + for l = 1:grid_data.nline + shift_idx = mod.line_start + 8*(l-1) + mod.dline_fl[1,l] = sol.u_curr[shift_idx] #pij + mod.dline_fl[2,l] = sol.u_curr[shift_idx + 1] #qij + mod.dline_fl[3,l] = sol.u_curr[shift_idx + 2] #pji + mod.dline_fl[4,l] = sol.u_curr[shift_idx + 3] #qji + end + + for b = 1: grid_data.nbus + dw_ct = 0 + dw_sum = 0 + dt_sum = 0 + dt_ct = 0 + if grid_data.FrStart[b] < grid_data.FrStart[b+1] + for k = grid_data.FrStart[b]:grid_data.FrStart[b+1]-1 + dw_sum += mod.dline_var[3, grid_data.FrIdx[k]] #wi(ij) + dw_ct += 1 + dt_sum += mod.dline_var[5, grid_data.FrIdx[k]] #ti(ij) + dt_ct += 1 + end + end + if grid_data.ToStart[b] < grid_data.ToStart[b+1] + for k = grid_data.ToStart[b]:grid_data.ToStart[b+1]-1 + dw_sum += mod.dline_var[4, grid_data.ToIdx[k]] #wj(ji) + dt_sum += mod.dline_var[6, grid_data.ToIdx[k]] #tj(ji) + dw_ct += 1 + dt_ct += 1 + end + end + mod.dw_sol[b] = dw_sum/dw_ct #average + mod.dtheta_sol[b] = dt_sum/dt_ct #average + end + end #inbound + return +end diff --git a/src/models/qpsub/qpsub_admm_prepoststep_gpu.jl b/src/models/qpsub/qpsub_admm_prepoststep_gpu.jl new file mode 100644 index 0000000..56d9899 --- /dev/null +++ b/src/models/qpsub/qpsub_admm_prepoststep_gpu.jl @@ -0,0 +1,105 @@ +""" + admm_poststep() + +- after admm termination, fix solution pf_projection() and record time mod.info.time_projection +- update info.objval and info.auglag with projected solution +- feed step solution, multipliers and KKT error info to SQOPF +""" + +function admm_poststep( + env::AdmmEnv{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}}, + 
mod::ModelQpsub{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}} +) + par, data, sol, info, grid_data = env.params, env.data, mod.solution, mod.info, mod.grid_data + + if env.use_projection + time_projection = @timed pf_projection(env, mod) + mod.info.time_projection = time_projection.time + end + + u_curr = zeros(mod.nvar) + copyto!(u_curr, sol.u_curr) + + l_curr = Array(sol.l_curr) + rho = Array(sol.rho) + rp = Array(sol.rp) + + c2 = zeros(grid_data.ngen) + copyto!(c2, mod.qpsub_c2) + + c1 = zeros(grid_data.ngen) + copyto!(c1, mod.qpsub_c1) + + sqp_line = Array(mod.sqp_line) + + FrStart = Array(grid_data.FrStart) + FrIdx = Array(grid_data.FrIdx) + + ToStart = Array(grid_data.ToStart) + ToIdx = Array(grid_data.ToIdx) + + Hs = Array(mod.Hs) + + #final objective and auglag + info.objval = sum(c2[g]*(grid_data.baseMVA*u_curr[mod.gen_start+2*(g-1)])^2 + + c1[g]*(grid_data.baseMVA*u_curr[mod.gen_start+2*(g-1)]) + for g in 1:grid_data.ngen) + + sum(0.5*dot(sqp_line[:,l],Hs[6*(l-1)+1:6*l,1:6],sqp_line[:,l]) for l=1:grid_data.nline) + + info.auglag = info.objval + + sum(l_curr[i]*rp[i] for i=1:mod.nvar) + + 0.5*sum(rho[i]*(rp[i])^2 for i=1:mod.nvar) + + + #find dual infeas kkt for SQP integration + pg_dual_infeas = [2*c2[g]*(grid_data.baseMVA)^2*u_curr[mod.gen_start+2*(g-1)] for g = 1:grid_data.ngen] + line_dual_infeas = vcat([Hs[6*(l-1)+1:6*l,1:6] * sqp_line[:,l] for l = 1:grid_data.nline]...) + mod.dual_infeas = vcat(pg_dual_infeas, line_dual_infeas) #unscale + + #assign value to step variable; due to inexact steps, other options include use u_curr, v_curr alone or average + #generation + @inbounds begin + for g = 1: grid_data.ngen + pg_idx = mod.gen_start + 2*(g-1) + qg_idx = mod.gen_start + 2*(g-1) + 1 + mod.dpg_sol[g] = u_curr[pg_idx] + mod.dqg_sol[g] = u_curr[qg_idx] + end + + copyto!(mod.dline_var, sqp_line) + + for l = 1:grid_data.nline + shift_idx = mod.line_start + 8*(l-1) + mod.dline_fl[1,l] = u_curr[shift_idx] #pij + mod.dline_fl[2,l] = u_curr[shift_idx + 1] #qij + mod.dline_fl[3,l] = u_curr[shift_idx + 2] #pji + mod.dline_fl[4,l] = u_curr[shift_idx + 3] #qji + end + + for b = 1: grid_data.nbus + dw_ct = 0 + dw_sum = 0 + dt_sum = 0 + dt_ct = 0 + if FrStart[b] < FrStart[b+1] + for k = FrStart[b]:FrStart[b+1]-1 + dw_sum += sqp_line[3, FrIdx[k]] #wi(ij) + dw_ct += 1 + dt_sum += sqp_line[5, FrIdx[k]] #ti(ij) + dt_ct += 1 + end + end + if ToStart[b] < ToStart[b+1] + for k = ToStart[b]:ToStart[b+1]-1 + dw_sum += sqp_line[4, ToIdx[k]] #wj(ji) + dt_sum += sqp_line[6, ToIdx[k]] #tj(ji) + dw_ct += 1 + dt_ct += 1 + end + end + mod.dw_sol[b] = dw_sum/dw_ct #average + mod.dtheta_sol[b] = dt_sum/dt_ct #average + end + end #inbound + return +end diff --git a/src/models/qpsub/qpsub_admm_update_l_single_cpu.jl b/src/models/qpsub/qpsub_admm_update_l_single_cpu.jl new file mode 100644 index 0000000..9de9c21 --- /dev/null +++ b/src/models/qpsub/qpsub_admm_update_l_single_cpu.jl @@ -0,0 +1,18 @@ +""" + admm_update_l() + +- update l +- record time info.time_l_update +- only used in one-level ADMM +""" + +function admm_update_l_single( + env::AdmmEnv{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}}, + mod::ModelQpsub{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}} +) + par, sol, info = env.params, mod.solution, mod.info + sol.l_prev = sol.l_curr + ltime = @timed sol.l_curr .= sol.l_prev + sol.rho .* (sol.u_curr-sol.v_curr) + info.time_l_update += ltime.time + return +end diff --git a/src/models/qpsub/qpsub_admm_update_l_single_gpu.jl 
b/src/models/qpsub/qpsub_admm_update_l_single_gpu.jl new file mode 100644 index 0000000..9d07703 --- /dev/null +++ b/src/models/qpsub/qpsub_admm_update_l_single_gpu.jl @@ -0,0 +1,37 @@ +""" + admm_update_l() + +- update l +- record time info.time_l_update +- only used in one-level ADMM +- GPU kernel: update_l_kernel_single +""" + + +function update_l_kernel_single( + n::Int, l_curr::CuDeviceArray{Float64,1}, l_prev::CuDeviceArray{Float64,1}, u::CuDeviceArray{Float64,1}, + v::CuDeviceArray{Float64,1}, rho::CuDeviceArray{Float64,1} + ) + tx = threadIdx().x + (blockDim().x * (blockIdx().x - 1)) + + if tx <= n + @inbounds begin + l_curr[tx] = l_prev[tx] + rho[tx] * (u[tx] - v[tx]) + end + end + + return +end + + + +function admm_update_l_single( + env::AdmmEnv{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}}, + mod::ModelQpsub{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}} + ) + par, sol, info = env.params, mod.solution, mod.info + sol.l_prev = sol.l_curr + ltime = CUDA.@timed @cuda threads=64 blocks=(div(mod.nvar-1, 64)+1) update_l_kernel_single(mod.nvar, sol.l_curr, sol.l_prev, sol.u_curr, sol.v_curr, sol.rho) + info.time_l_update += ltime.time + return +end diff --git a/src/models/qpsub/qpsub_admm_update_residual_cpu.jl b/src/models/qpsub/qpsub_admm_update_residual_cpu.jl new file mode 100644 index 0000000..0c91a0f --- /dev/null +++ b/src/models/qpsub/qpsub_admm_update_residual_cpu.jl @@ -0,0 +1,39 @@ +""" + admm_update_residual() + +- compute termination errors and other info +- update info.primres, info.dualres, info.mismatch, info.objval, info.auglag +- update sol.rp, sol.rd, sol.Ax_plus_By +- only used in one-level ADMM +""" + +function admm_update_residual( + env::AdmmEnv{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}}, + mod::ModelQpsub{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}} +) + sol, info, data, par, grid_data = mod.solution, mod.info, env.data, env.params, mod.grid_data + + + sol.rp .= sol.u_curr .- sol.v_curr #u-v + sol.rd .= sol.rho .* (sol.v_curr - mod.v_prev)#from Boyd's single-level admm + sol.Ax_plus_By .= sol.rp #x-xbar + + info.primres = norm(sol.rp) + info.dualres = norm(sol.rd) + info.mismatch = norm(sol.Ax_plus_By) + + + info.objval = sum(mod.qpsub_c2[g]*(grid_data.baseMVA*sol.u_curr[mod.gen_start+2*(g-1)])^2 + + mod.qpsub_c1[g]*(grid_data.baseMVA*sol.u_curr[mod.gen_start+2*(g-1)]) + for g in 1:grid_data.ngen) + + sum(0.5*dot(mod.sqp_line[:,l],mod.Hs[6*(l-1)+1:6*l,1:6],mod.sqp_line[:,l]) for l=1:grid_data.nline) + + info.auglag = info.objval + sum(sol.lz[i]*sol.z_curr[i] for i=1:mod.nvar) + + 0.5*par.beta*sum(sol.z_curr[i]^2 for i=1:mod.nvar) + + sum(sol.l_curr[i]*sol.rp[i] for i=1:mod.nvar) + + 0.5*sum(sol.rho[i]*(sol.rp[i])^2 for i=1:mod.nvar) + + + + return +end diff --git a/src/models/qpsub/qpsub_admm_update_residual_gpu.jl b/src/models/qpsub/qpsub_admm_update_residual_gpu.jl new file mode 100644 index 0000000..f59192c --- /dev/null +++ b/src/models/qpsub/qpsub_admm_update_residual_gpu.jl @@ -0,0 +1,52 @@ +""" + admm_update_residual() + +- compute termination errors and other info +- update info.primres, info.dualres, info.mismatch +- update sol.rp, sol.rd, sol.Ax_plus_By +- only used in one-level ADMM +- GPU kernel: compute_primal_residual_kernel_qpsub, compute_dual_residual_kernel_qpsub, copy_data_kernel +""" + +function compute_primal_residual_kernel_qpsub(n::Int, rp::CuDeviceArray{Float64,1}, u::CuDeviceArray{Float64,1}, v::CuDeviceArray{Float64,1}) + tx = threadIdx().x + (blockDim().x * (blockIdx().x - 1)) + + 
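    # tx is the 1-based global index of this thread over the nvar solution
    # entries; the kernel is launched with threads=64 and blocks=div(nvar-1,64)+1,
    # so the `if tx <= n` guard that follows drops the excess threads in the
    # last block.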
if tx <= n + rp[tx] = u[tx] - v[tx] #u-v or x-xbar + end + + return +end + +function compute_dual_residual_kernel_qpsub(n::Int, rd::CuDeviceArray{Float64,1}, v_curr::CuDeviceArray{Float64,1}, v_prev::CuDeviceArray{Float64,1}, rho) + tx = threadIdx().x + (blockDim().x * (blockIdx().x - 1)) + + if tx <= n + rd[tx] = rho[tx] * (v_curr[tx] - v_prev[tx]) #from Boyd's single-level admm + end + + return +end + + +function admm_update_residual( + env::AdmmEnv{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}}, + mod::ModelQpsub{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}} +) + sol, info, data, par, grid_data = mod.solution, mod.info, env.data, env.params, mod.grid_data + + @cuda threads=64 blocks=(div(mod.nvar-1, 64)+1) compute_primal_residual_kernel_qpsub(mod.nvar, sol.rp, sol.u_curr, sol.v_curr) + + @cuda threads=64 blocks=(div(mod.nvar-1, 64)+1) compute_dual_residual_kernel_qpsub(mod.nvar, sol.rd, sol.v_curr, mod.v_prev, sol.rho) + + @cuda threads=64 blocks=(div(mod.nvar-1, 64)+1) copy_data_kernel(mod.nvar, sol.Ax_plus_By, sol.rp) # from gpu utility + + + info.primres = CUDA.norm(sol.rp) + + info.dualres = CUDA.norm(sol.rd) + + info.mismatch = CUDA.norm(sol.Ax_plus_By) + + return +end diff --git a/src/models/qpsub/qpsub_admm_update_x_cpu.jl b/src/models/qpsub/qpsub_admm_update_x_cpu.jl new file mode 100644 index 0000000..d2d1b4b --- /dev/null +++ b/src/models/qpsub/qpsub_admm_update_x_cpu.jl @@ -0,0 +1,51 @@ +""" + acopf_admm_update_x_line() + +- update xline: call auglag_linelimit_two_level_alternative_qpsub() = update sol.x[pij_idx] +- record run time info.user.time_branches, info.time_x_update +""" + +function acopf_admm_update_x_line( + env::AdmmEnv{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}}, + mod::ModelQpsub{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}} + ) + par, sol, info, data = env.params, mod.solution, mod.info, mod.grid_data + +@inbounds begin + for i = 1 : mod.grid_data.nline + shift_idx = mod.line_start + 8*(i-1) + A_ipopt = eval_A_branch_kernel_cpu_qpsub(mod.Hs[6*(i-1)+1:6*i,1:6], sol.l_curr[shift_idx : shift_idx + 7], + sol.rho[shift_idx : shift_idx + 7], sol.v_curr[shift_idx : shift_idx + 7], + sol.z_curr[shift_idx : shift_idx + 7], + mod.grid_data.YffR[i], mod.grid_data.YffI[i], + mod.grid_data.YftR[i], mod.grid_data.YftI[i], + mod.grid_data.YttR[i], mod.grid_data.YttI[i], + mod.grid_data.YtfR[i], mod.grid_data.YtfI[i]) + + b_ipopt = eval_b_branch_kernel_cpu_qpsub(sol.l_curr[shift_idx : shift_idx + 7], + sol.rho[shift_idx : shift_idx + 7], sol.v_curr[shift_idx : shift_idx + 7], + sol.z_curr[shift_idx : shift_idx + 7], + mod.grid_data.YffR[i], mod.grid_data.YffI[i], + mod.grid_data.YftR[i], mod.grid_data.YftI[i], + mod.grid_data.YttR[i], mod.grid_data.YttI[i], + mod.grid_data.YtfR[i], mod.grid_data.YtfI[i], mod.line_res[:,i]) + + #call reduced branch kernel + time_br = @timed tronx, tronf = auglag_Ab_linelimit_two_level_alternative_qpsub_ij_red(info.inner, par.max_auglag, par.mu_max, par.scale, A_ipopt, b_ipopt, mod.ls[i,:], mod.us[i,:], mod.sqp_line, sol.l_curr[shift_idx : shift_idx + 7], + sol.rho[shift_idx : shift_idx + 7], sol.u_curr, shift_idx, sol.v_curr[shift_idx : shift_idx + 7], + sol.z_curr[shift_idx : shift_idx + 7], mod.qpsub_membuf,i, + mod.grid_data.YffR[i], mod.grid_data.YffI[i], + mod.grid_data.YftR[i], mod.grid_data.YftI[i], + mod.grid_data.YttR[i], mod.grid_data.YttI[i], + mod.grid_data.YtfR[i], mod.grid_data.YtfI[i], + mod.LH_1h[i,:], mod.RH_1h[i], mod.LH_1i[i,:], mod.RH_1i[i], mod.LH_1j[i,:], mod.RH_1j[i], 
mod.LH_1k[i,:], mod.RH_1k[i], mod.lambda, mod.line_res[:,i]) + + + + info.user.time_branches += time_br.time + info.time_x_update += time_br.time + end +end #@inbounds + +return +end diff --git a/src/models/qpsub/qpsub_admm_update_x_gpu.jl b/src/models/qpsub/qpsub_admm_update_x_gpu.jl new file mode 100644 index 0000000..61701af --- /dev/null +++ b/src/models/qpsub/qpsub_admm_update_x_gpu.jl @@ -0,0 +1,26 @@ +""" + acopf_admm_update_x_line() + +- update xline: call auglag_linelimit_qpsub() = update sol.x[pij_idx] +- record run time info.user.time_branches, info.time_x_update +""" + +function acopf_admm_update_x_line( + env::AdmmEnv{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}}, + mod::ModelQpsub{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}} + ) + par, sol, info, data = env.params, mod.solution, mod.info, mod.grid_data + + shmem_size = env.params.shmem_size + + time_br = CUDA.@timed @cuda threads=32 blocks=data.nline shmem=shmem_size auglag_linelimit_qpsub(mod.Hs, sol.l_curr, sol.rho, sol.u_curr, sol.v_curr, sol.z_curr, mod.grid_data.YffR, mod.grid_data.YffI, + mod.grid_data.YftR, mod.grid_data.YftI, + mod.grid_data.YttR, mod.grid_data.YttI, + mod.grid_data.YtfR, mod.grid_data.YtfI, info.inner, par.max_auglag, par.mu_max, par.scale, mod.ls, mod.us, mod.sqp_line, + mod.qpsub_membuf, mod.LH_1h, mod.RH_1h, mod.LH_1i, mod.RH_1i, mod.LH_1j, mod.RH_1j, mod.LH_1k, mod.RH_1k, mod.lambda, mod.line_start, mod.grid_data.nline, mod.supY, mod.line_res) + + info.user.time_branches += time_br.time + info.time_x_update += time_br.time + +return +end diff --git a/src/models/qpsub/qpsub_admm_update_xbar_cpu.jl b/src/models/qpsub/qpsub_admm_update_xbar_cpu.jl new file mode 100644 index 0000000..6a48073 --- /dev/null +++ b/src/models/qpsub/qpsub_admm_update_xbar_cpu.jl @@ -0,0 +1,23 @@ +""" + admm_update_xbar() + +- update xbar: call bus_kernel_two_level_alternative_qpsub() = update sol.v (full) +- record run time info.time_xbar_update, info.user.time_buses +""" + + + +function admm_update_xbar( + env::AdmmEnv{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}}, + mod::ModelQpsub{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}} +) +sol, info, data = mod.solution, mod.info, mod.grid_data +mod.v_prev .= sol.v_curr +bus_time = @timed bus_kernel_two_level_alternative(data.baseMVA, data.nbus, mod.gen_start, mod.line_start, + data.FrStart, data.FrIdx, data.ToStart, data.ToIdx, data.GenStart, + data.GenIdx, mod.qpsub_Pd, mod.qpsub_Qd, sol.u_curr, sol.v_curr, sol.z_curr, + sol.l_curr, sol.rho, data.YshR, data.YshI) +info.time_xbar_update += bus_time.time + info.user.time_buses += bus_time.time + return +end diff --git a/src/models/qpsub/qpsub_admm_update_xbar_gpu.jl b/src/models/qpsub/qpsub_admm_update_xbar_gpu.jl new file mode 100644 index 0000000..8b6d40b --- /dev/null +++ b/src/models/qpsub/qpsub_admm_update_xbar_gpu.jl @@ -0,0 +1,27 @@ +""" + admm_update_xbar() + +- update xbar: call bus_kernel_two_level_alternative_qpsub() = update sol.v (full) +- record run time info.time_xbar_update, info.user.time_buses +""" + + + +function admm_update_xbar( + env::AdmmEnv{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}}, + mod::ModelQpsub{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}} + ) + sol, info, data = mod.solution, mod.info, mod.grid_data + + nblk_bus = div(data.nbus, 32, RoundUp) + + @cuda threads=64 blocks=(div(mod.nvar-1, 64)+1) copy_data_kernel(mod.nvar, mod.v_prev, sol.v_curr) + + bus_time = CUDA.@timed @cuda threads=32 blocks=nblk_bus 
bus_kernel_two_level_alternative(data.baseMVA, data.nbus, mod.gen_start, mod.line_start, + data.FrStart, data.FrIdx, data.ToStart, data.ToIdx, data.GenStart, + data.GenIdx, mod.qpsub_Pd, mod.qpsub_Qd, sol.u_curr, sol.v_curr, sol.z_curr, + sol.l_curr, sol.rho, data.YshR, data.YshI) + info.time_xbar_update += bus_time.time + info.user.time_buses += bus_time.time + return +end diff --git a/src/models/qpsub/qpsub_auglag_Ab_linelimit_kernel_red_cpu.jl b/src/models/qpsub/qpsub_auglag_Ab_linelimit_kernel_red_cpu.jl new file mode 100644 index 0000000..c1603f7 --- /dev/null +++ b/src/models/qpsub/qpsub_auglag_Ab_linelimit_kernel_red_cpu.jl @@ -0,0 +1,157 @@ +""" + auglag_linelimit_two_level_alternative_qpsub_ij_red() + +- for certain line (i,j), update sol.u[pij_idx] +- use Exatron, eval_A_auglag_branch_kernel_cpu_qpsub, eval_b_auglag_branch_kernel_cpu_qpsub, build_QP_DS +- LANCELOT ALM algorithm +- with elimination of w_ijR and w_ijI (v2 in overleaf) +- with multiplier output +""" + + +""" + Internal Solution Structure for branch + +- branch structure from u (8*nline): + - |p_ij | q_ij | p_ji | q_ji | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji) | + +- branch structure for Exatron (8*nline): + - | t_ij(linelimit) | t_ji(linelimit) | w_ijR | w_ijI | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji) + +- branch structure for Exatron (6*nline): eliminate w_ijR, wij_I + - | t_ij(linelimit) | t_ji(linelimit) | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji) + +- Hessian inherited from SQP ie sqp_line (6*nline): + - |w_ijR | w_ijI | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji)| +""" + + +function auglag_Ab_linelimit_two_level_alternative_qpsub_ij_red( + major_iter::Int, max_auglag::Int, mu_max::Float64, scale::Float64, + Hbr::Array{Float64,2}, bbr::Array{Float64,1}, lqp::Array{Float64,1}, uqp::Array{Float64,1},sqp_line::Array{Float64,2}, + l::Array{Float64,1}, rho::Array{Float64,1}, + u::Array{Float64,1}, shift_idx::Int, v::Array{Float64,1}, z_curr::Array{Float64,1}, membuf::Array{Float64,2},lineidx::Int, + YffR::Float64, YffI::Float64, + YftR::Float64, YftI::Float64, + YttR::Float64, YttI::Float64, + YtfR::Float64, YtfI::Float64, + LH_1h::Array{Float64,1}, RH_1h::Float64, + LH_1i::Array{Float64,1}, RH_1i::Float64, LH_1j::Array{Float64,1},RH_1j::Float64, LH_1k::Array{Float64,1},RH_1k::Float64, lambda::Array{Float64,2}, line_res::Array{Float64,1}) + + + # variable wrt branch structure wrt Exatron + x = [0.0; 0.0; sqp_line[3:6,lineidx]] #initialization + f = 0.0 + xl = [0.0; 0.0; lqp[3:6]] + xu = [200000.0; 200000.0; uqp[3:6]] + trg = zeros(6) #hold multiplier + + Ctron = zeros(8,6) + dtron = zeros(8) + + # initialization on penalty + if major_iter == 1 #info.inner = 1 (first inner iteration) + membuf[5,lineidx] = 10.0 #reset ρ_1h = ρ_1i = ρ_1j = ρ_1k (let ρ the same for all AL terms) + mu = 10.0 # set mu = initial ρ_* + else + mu = membuf[5,lineidx] #inherit mu = ρ from the previous inner iteration + end + + + # set internal parameters eta omega mu to guide ALM convergence + eta = 1 / mu^0.1 + + + + it = 0 #iteration count + terminate = false #termination status + + #pij qij pji qji wrt branch structure ExaTron + supY = [0 0 YftR YftI YffR 0 0 0; + 0 0 -YftI YftR -YffI 0 0 0; + 0 0 YtfR -YtfI 0 YttR 0 0; + 0 0 -YtfI -YtfR 0 -YttI 0 0] + + #ALM on equality constraint wrt branch structure ExaTron + vec_1j = [1, 0, 0, 0, 0, 0, 0, 0] + LH_1j[1]* supY[1,:] + LH_1j[2]* supY[2,:] #1j with t_ij + vec_1k = [0, 1, 0, 0, 0, 0, 0, 0] + LH_1k[1]* supY[3,:] + LH_1k[2]* supY[4,:] #1k with t_ji + + + while !terminate + it += 1 + + #create 
QP parameters SCALED + Atron, Atron_red = eval_A_auglag_branch_kernel_cpu_qpsub_red(Hbr,l, rho, v, z_curr, membuf[:,lineidx], + YffR, YffI, YftR, YftI, YttR, YttI, YtfR, YtfI, + LH_1h, RH_1h, LH_1i, RH_1i, LH_1j, RH_1j, LH_1k, RH_1k, scale) + Ctron, dtron, btron_red = eval_b_auglag_branch_kernel_cpu_qpsub_red(Atron, bbr,l, rho, v, z_curr, membuf[:,lineidx], + YffR, YffI, YftR, YftI, YttR, YttI, YtfR, YtfI, + LH_1h, RH_1h, LH_1i, RH_1i, LH_1j, RH_1j, LH_1k, RH_1k, scale) + + + tron = build_QP_DS(Atron_red,btron_red,xl,xu) + tron.x .= x #initialization on tron + + # Solve the branch problem. + status = ExaTron.solveProblem(tron) + x .= tron.x + sqp_line[:,lineidx] .= (Ctron * x + dtron)[3:8] #write to sqp_line + f = tron.f #wont match with IPOPT obj because constant terms are ignored here + trg = tron.g + + + + # violation on 1h,1i,1j,1k + cviol3 = dot(vec_1j, Ctron * x + dtron) - RH_1j + cviol4 = dot(vec_1k, Ctron * x + dtron) - RH_1k + + cnorm = max(abs(cviol3), abs(cviol4)) #worst violation + + if cnorm <= eta + if cnorm <= 1e-6 + terminate = true + else + membuf[3,lineidx] += mu*cviol3 #λ_1j + membuf[4,lineidx] += mu*cviol4 #λ_1k + + eta = eta / mu^0.9 + end + else + mu = min(mu_max, mu*10) #increase penalty + eta = 1 / mu^0.1 + membuf[5,lineidx] = mu #save penalty for current inner iteration + end + + if it >= max_auglag && cnorm > 1e-6 #maximum iteration for auglag + println() + println("max_auglag reached for line with cnorm = ", cnorm, " for line = ", lineidx, " with mu ", mu) + + terminate = true + end + end #end while ALM + + + + #save variables + u[shift_idx] = dot(supY[1,:],Ctron * x + dtron) + line_res[1]#pij + u[shift_idx + 1] = dot(supY[2,:],Ctron * x + dtron) + line_res[2]#qij + u[shift_idx + 2] = dot(supY[3,:],Ctron * x + dtron) + line_res[3]#pji + u[shift_idx + 3] = dot(supY[4,:],Ctron * x + dtron) + line_res[4]#qji + u[shift_idx + 4] = x[3] #wi + u[shift_idx + 5] = x[4] #wj + u[shift_idx + 6] = x[5] #thetai + u[shift_idx + 7] = x[6] #thetaj + + #get multiplier + tmpH = inv([LH_1h[1] LH_1i[1]; LH_1h[2] LH_1i[2]]) + tmp14_i = [2*u[shift_idx]*YftR + 2*u[shift_idx + 1]*(-YftI), 2*u[shift_idx]*YftI + 2*u[shift_idx + 1]*YftR] + tmp14_h = [2*u[shift_idx + 2]*YtfR + 2*u[shift_idx + 3]*(-YtfI), 2*u[shift_idx + 2]*-YtfI + 2*u[shift_idx + 3]*(-YtfR)] + #14h 14i + lambda[1:2,lineidx] = -tmpH*(trg[1]*tmp14_i + trg[2]*tmp14_h + Hbr[1:2,1:2]*sqp_line[1:2,lineidx] + Hbr[1:2,3:6]*sqp_line[3:6,lineidx] + bbr[1:2]) + #14j + lambda[3,lineidx] = -abs(trg[1]) #<=0 one_side ineq + #14k + lambda[4,lineidx] = -abs(trg[2]) #<=0 one-side ineq + + return Ctron * x + dtron, f +end diff --git a/src/models/qpsub/qpsub_auglag_Ab_linelimit_kernel_red_gpu.jl b/src/models/qpsub/qpsub_auglag_Ab_linelimit_kernel_red_gpu.jl new file mode 100644 index 0000000..ce5e00b --- /dev/null +++ b/src/models/qpsub/qpsub_auglag_Ab_linelimit_kernel_red_gpu.jl @@ -0,0 +1,281 @@ +""" + auglag_linelimit_qpsub() + +- for certain line (i,j), update sol.u[pij_idx] +- use Exatron, eval_A_b_branch_kernel_gpu_qpsub, eval_A_b_auglag_branch_kernel_gpu_qpsub_red, tron_gpu_test +- LANCELOT ALM algorithm +- with elimination of w_ijR and w_ijI (v2 in overleaf) +- with multiplier output +""" + + +""" + Internal Solution Structure for branch + +- branch structure from u (8*nline): + - |p_ij | q_ij | p_ji | q_ji | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji) | + +- branch structure for Exatron (8*nline): + - | t_ij(linelimit) | t_ji(linelimit) | w_ijR | w_ijI | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji) + +- branch structure for Exatron 
(6*nline): eliminate w_ijR, wij_I + - | t_ij(linelimit) | t_ji(linelimit) | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji) + +- Hessian inherited from SQP ie sqp_line (6*nline): + - |w_ijR | w_ijI | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji)| +""" + + +function auglag_linelimit_qpsub(Hs, l_curr, rho, u_curr, v_curr, z_curr, YffR, YffI, + YftR, YftI, YttR, YttI, YtfR, YtfI, inner, max_auglag, mu_max, scale, lqp, uqp, sqp_line, + membuf, LH_1h, RH_1h, LH_1i, RH_1i, LH_1j, RH_1j, LH_1k, RH_1k, lambda, line_start, nline, supY, line_res) + + tx = threadIdx().x + lineidx = blockIdx().x + shift_idx = line_start + 8*(lineidx-1) + + + n = 6 + + x = CuDynamicSharedArray(Float64, n) #memory allocation + xl = CuDynamicSharedArray(Float64, n, n*sizeof(Float64)) + xu = CuDynamicSharedArray(Float64, n, (2*n)*sizeof(Float64)) + trg = CuDynamicSharedArray(Float64, n, (3*n)*sizeof(Float64)) + Hbr = CuDynamicSharedArray(Float64, (n,n), (4*n)*sizeof(Float64)) + bbr = CuDynamicSharedArray(Float64, n, (4*n + n^2)*sizeof(Float64)) + vec_1j = CuDynamicSharedArray(Float64, 8, (5*n + n^2)*sizeof(Float64)) + vec_1k = CuDynamicSharedArray(Float64, 8, (5*n + n^2 + 8)*sizeof(Float64)) + Ctron = CuDynamicSharedArray(Float64, (8,6), (5*n + n^2 + 16)*sizeof(Float64)) + dtron = CuDynamicSharedArray(Float64, 8, (5*n + n^2 + 64)*sizeof(Float64)) + A_aug = CuDynamicSharedArray(Float64, (8,8), (5*n + n^2 + 72)*sizeof(Float64)) + Atron = CuDynamicSharedArray(Float64, (6,6), (5*n + n^2 + 136)*sizeof(Float64)) + btron = CuDynamicSharedArray(Float64, 6, (5*n + n^2 + 172)*sizeof(Float64)) + + + #initialization: variable wrt branch structure wrt Exatron + x[1] = 0.0 + x[2] = 0.0 + x[3] = sqp_line[3,lineidx] + x[4] = sqp_line[4,lineidx] + x[5] = sqp_line[5,lineidx] + x[6] = sqp_line[6,lineidx] + + xl[1] = 0.0 + xl[2] = 0.0 + xl[3] = lqp[lineidx,3] + xl[4] = lqp[lineidx,4] + xl[5] = lqp[lineidx,5] + xl[6] = lqp[lineidx,6] + + xu[1] = 200000.0 + xu[2] = 200000.0 + xu[3] = uqp[lineidx,3] + xu[4] = uqp[lineidx,4] + xu[5] = uqp[lineidx,5] + xu[6] = uqp[lineidx,6] + + trg[1] = 0.0 + trg[2] = 0.0 + trg[3] = 0.0 + trg[4] = 0.0 + trg[5] = 0.0 + trg[6] = 0.0 + + eval_A_b_branch_kernel_gpu_qpsub( + Hs, l_curr, rho, v_curr, z_curr, Hbr, bbr, lineidx, shift_idx, supY, line_res) + + vec_1j[1] = 1 + LH_1j[lineidx,1]*supY[4*(lineidx - 1)+1,1] + LH_1j[lineidx,2]*supY[4*(lineidx - 1)+2,1] + vec_1j[2] = LH_1j[lineidx,1]*supY[4*(lineidx - 1)+1,2] + LH_1j[lineidx,2]*supY[4*(lineidx - 1)+2,2] + vec_1j[3] = LH_1j[lineidx,1]*supY[4*(lineidx - 1)+1,3] + LH_1j[lineidx,2]*supY[4*(lineidx - 1)+2,3] + vec_1j[4] = LH_1j[lineidx,1]*supY[4*(lineidx - 1)+1,4] + LH_1j[lineidx,2]*supY[4*(lineidx - 1)+2,4] + vec_1j[5] = LH_1j[lineidx,1]*supY[4*(lineidx - 1)+1,5] + LH_1j[lineidx,2]*supY[4*(lineidx - 1)+2,5] + vec_1j[6] = LH_1j[lineidx,1]*supY[4*(lineidx - 1)+1,6] + LH_1j[lineidx,2]*supY[4*(lineidx - 1)+2,6] + vec_1j[7] = LH_1j[lineidx,1]*supY[4*(lineidx - 1)+1,7] + LH_1j[lineidx,2]*supY[4*(lineidx - 1)+2,7] + vec_1j[8] = LH_1j[lineidx,1]*supY[4*(lineidx - 1)+1,8] + LH_1j[lineidx,2]*supY[4*(lineidx - 1)+2,8] + + vec_1k[1] = LH_1k[lineidx,1]*supY[4*(lineidx - 1)+3,1] + LH_1k[lineidx,2]*supY[4*(lineidx - 1)+4,1] + vec_1k[2] = 1 + LH_1k[lineidx,1]*supY[4*(lineidx - 1)+3,2] + LH_1k[lineidx,2]*supY[4*(lineidx - 1)+4,2] + vec_1k[3] = LH_1k[lineidx,1]*supY[4*(lineidx - 1)+3,3] + LH_1k[lineidx,2]*supY[4*(lineidx - 1)+4,3] + vec_1k[4] = LH_1k[lineidx,1]*supY[4*(lineidx - 1)+3,4] + LH_1k[lineidx,2]*supY[4*(lineidx - 1)+4,4] + vec_1k[5] = LH_1k[lineidx,1]*supY[4*(lineidx - 1)+3,5] + 
LH_1k[lineidx,2]*supY[4*(lineidx - 1)+4,5] + vec_1k[6] = LH_1k[lineidx,1]*supY[4*(lineidx - 1)+3,6] + LH_1k[lineidx,2]*supY[4*(lineidx - 1)+4,6] + vec_1k[7] = LH_1k[lineidx,1]*supY[4*(lineidx - 1)+3,7] + LH_1k[lineidx,2]*supY[4*(lineidx - 1)+4,7] + vec_1k[8] = LH_1k[lineidx,1]*supY[4*(lineidx - 1)+3,8] + LH_1k[lineidx,2]*supY[4*(lineidx - 1)+4,8] + + + prod = LH_1h[lineidx,1]*LH_1i[lineidx,2]-LH_1h[lineidx,2]*LH_1i[lineidx,1] + inv11 = LH_1i[lineidx,2]/prod + inv12 = -LH_1h[lineidx,2]/prod + inv21 = -LH_1i[lineidx,1]/prod + inv22 = LH_1h[lineidx,1]/prod + fill!(Ctron,0.0) + Ctron[1,1] = 1.0 + Ctron[2,2] = 1.0 + Ctron[3,1] = 0.0 + Ctron[3,2] = 0.0 + Ctron[3,3] = -inv11*LH_1h[lineidx,3] + Ctron[3,4] = -inv11*LH_1h[lineidx,4] + Ctron[3,5] = -inv12*LH_1i[lineidx,3] + Ctron[3,6] = -inv12*LH_1i[lineidx,4] + Ctron[4,1] = 0.0 + Ctron[4,2] = 0.0 + Ctron[4,3] = -inv21*LH_1h[lineidx,3] + Ctron[4,4] = -inv21*LH_1h[lineidx,4] + Ctron[4,5] = -inv22*LH_1i[lineidx,3] + Ctron[4,6] = -inv22*LH_1i[lineidx,4] + Ctron[5,3] = 1.0 + Ctron[6,4] = 1.0 + Ctron[7,5] = 1.0 + Ctron[8,6] = 1.0 + + + dtron[1] = 0.0 + dtron[2] = 0.0 + dtron[3] = inv11*RH_1h[lineidx] + inv12*RH_1i[lineidx] + dtron[4] = inv21*RH_1h[lineidx] + inv22*RH_1i[lineidx] + dtron[5] = 0.0 + dtron[6] = 0.0 + dtron[7] = 0.0 + dtron[8] = 0.0 + + + # initialization on penalty + if inner == 1 #info.inner = 1 (first inner iteration) + membuf[5,lineidx] = 10.0 #reset ρ_1h = ρ_1i = ρ_1j = ρ_1k (let ρ the same for all AL terms) + mu = 10.0 # set mu = initial ρ_* + else + mu = membuf[5,lineidx] #inherit mu = ρ from the previous inner iteration + end + + + + CUDA.sync_threads() + # set internal parameters eta omega mu to guide ALM convergence + eta = 1 / mu^0.1 + + + it = 0 #iteration count + terminate = false #termination status + + + + + + while !terminate + it += 1 + + eval_A_b_auglag_branch_kernel_gpu_qpsub_red(Hbr, bbr, A_aug, Atron, btron, scale,vec_1j,vec_1k,membuf,lineidx, Ctron,dtron,RH_1j,RH_1k) + + status, minor_iter = tron_gpu_test(n,Atron,btron,x,xl,xu) + + sqp0 = Ctron[1,1] * x[1] + Ctron[1,2]*x[2] + Ctron[1,3]*x[3] + Ctron[1,4]*x[4] + + Ctron[1,5]*x[5] + Ctron[1,6]*x[6] + dtron[1] + sqp1 = Ctron[2,1] * x[1] + Ctron[2,2]*x[2] + Ctron[2,3]*x[3] + Ctron[2,4]*x[4] + + Ctron[2,5]*x[5] + Ctron[2,6]*x[6] + dtron[2] + + sqp_line[1,lineidx] = Ctron[3,1] * x[1] + Ctron[3,2]*x[2] + Ctron[3,3]*x[3] + Ctron[3,4]*x[4] + + Ctron[3,5]*x[5] + Ctron[3,6]*x[6] + dtron[3] + sqp_line[2,lineidx] = Ctron[4,1] * x[1] + Ctron[4,2]*x[2] + Ctron[4,3]*x[3] + Ctron[4,4]*x[4] + + Ctron[4,5]*x[5] + Ctron[4,6]*x[6] + dtron[4] + sqp_line[3,lineidx] = Ctron[5,1] * x[1] + Ctron[5,2]*x[2] + Ctron[5,3]*x[3] + Ctron[5,4]*x[4] + + Ctron[5,5]*x[5] + Ctron[5,6]*x[6] + dtron[5] + sqp_line[4,lineidx] = Ctron[6,1] * x[1] + Ctron[6,2]*x[2] + Ctron[6,3]*x[3] + Ctron[6,4]*x[4] + + Ctron[6,5]*x[5] + Ctron[6,6]*x[6] + dtron[6] + sqp_line[5,lineidx] = Ctron[7,1] * x[1] + Ctron[7,2]*x[2] + Ctron[7,3]*x[3] + Ctron[7,4]*x[4] + + Ctron[7,5]*x[5] + Ctron[7,6]*x[6] + dtron[7] + sqp_line[6,lineidx] = Ctron[8,1] * x[1] + Ctron[8,2]*x[2] + Ctron[8,3]*x[3] + Ctron[8,4]*x[4] + + Ctron[8,5]*x[5] + Ctron[8,6]*x[6] + dtron[8] + + + + cviol3 = vec_1j[1]*sqp0 + vec_1j[2]*sqp1 + vec_1j[3]*sqp_line[1,lineidx] + vec_1j[4]*sqp_line[2,lineidx] + vec_1j[5]*sqp_line[3,lineidx] + + vec_1j[6]*sqp_line[4,lineidx] + vec_1j[7]*sqp_line[5,lineidx] + vec_1j[8]*sqp_line[6,lineidx] - RH_1j[lineidx] + + cviol4 = vec_1k[1]*sqp0 + vec_1k[2]*sqp1 + vec_1k[3]*sqp_line[1,lineidx] + vec_1k[4]*sqp_line[2,lineidx] + 
vec_1k[5]*sqp_line[3,lineidx] + + vec_1k[6]*sqp_line[4,lineidx] + vec_1k[7]*sqp_line[5,lineidx] + vec_1k[8]*sqp_line[6,lineidx] - RH_1k[lineidx] + + cnorm = max(abs(cviol3), abs(cviol4)) #worst violation + + if cnorm <= eta + if cnorm <= 1e-6 + terminate = true + else + if tx == 1 + membuf[3,lineidx] += mu*cviol3 #λ_1j + membuf[4,lineidx] += mu*cviol4 #λ_1k + end + eta = eta / mu^0.9 + + end + else + mu = min(mu_max, mu*10) #increase penalty + eta = 1 / mu^0.1 + membuf[5,lineidx] = mu #save penalty for current inner iteration + end + + if it >= max_auglag #maximum iteration for auglag + if tx == 1 + @cuprintln("max iteration reach at block = ",lineidx, "and threads = ",tx) + end + terminate = true + end + + CUDA.sync_threads() + end #end while ALM + + + + # #save variables + u_curr[shift_idx] = supY[4*(lineidx - 1) + 1,3]*sqp_line[1,lineidx] + supY[4*(lineidx - 1) + 1,4]*sqp_line[2,lineidx] + supY[4*(lineidx - 1) + 1,5]*sqp_line[3,lineidx] + + supY[4*(lineidx - 1) + 1,6]*sqp_line[4,lineidx] + supY[4*(lineidx - 1) + 1,7]*sqp_line[5,lineidx] + supY[4*(lineidx - 1) + 1,8]*sqp_line[6,lineidx] + line_res[1,lineidx] + + u_curr[shift_idx + 1] = supY[4*(lineidx - 1) + 2,3]*sqp_line[1,lineidx] + supY[4*(lineidx - 1) + 2,4]*sqp_line[2,lineidx] + supY[4*(lineidx - 1) + 2,5]*sqp_line[3,lineidx] + + supY[4*(lineidx - 1) + 2,6]*sqp_line[4,lineidx] + supY[4*(lineidx - 1) + 2,7]*sqp_line[5,lineidx] + supY[4*(lineidx - 1) + 2,8]*sqp_line[6,lineidx] + line_res[2,lineidx] + + u_curr[shift_idx + 2] = supY[4*(lineidx - 1) + 3,3]*sqp_line[1,lineidx] + supY[4*(lineidx - 1) + 3,4]*sqp_line[2,lineidx] + supY[4*(lineidx - 1) + 3,5]*sqp_line[3,lineidx] + + supY[4*(lineidx - 1) + 3,6]*sqp_line[4,lineidx] + supY[4*(lineidx - 1) + 3,7]*sqp_line[5,lineidx] + supY[4*(lineidx - 1) + 3,8]*sqp_line[6,lineidx] + line_res[3,lineidx] + + u_curr[shift_idx + 3] = supY[4*(lineidx - 1) + 4,3]*sqp_line[1,lineidx] + supY[4*(lineidx - 1) + 4,4]*sqp_line[2,lineidx] + supY[4*(lineidx - 1) + 4,5]*sqp_line[3,lineidx] + + supY[4*(lineidx - 1) + 4,6]*sqp_line[4,lineidx] + supY[4*(lineidx - 1) + 4,7]*sqp_line[5,lineidx] + supY[4*(lineidx - 1) + 4,8]*sqp_line[6,lineidx] + line_res[4,lineidx] + + u_curr[shift_idx + 4] = x[3] #wi + u_curr[shift_idx + 5] = x[4] #wj + u_curr[shift_idx + 6] = x[5] #thetai + u_curr[shift_idx + 7] = x[6] #thetaj + + + #get multiplier + tmpH11 = inv11 + tmpH12 = inv21 + tmpH21 = inv12 + tmpH22 = inv22 + + tmp14i_1 = 2*u_curr[shift_idx]*YftR[lineidx] + 2*u_curr[shift_idx + 1]*(-YftI[lineidx]) + tmp14i_2 = 2*u_curr[shift_idx]*YftI[lineidx] + 2*u_curr[shift_idx + 1]*YftR[lineidx] + + tmp14h_1 = 2*u_curr[shift_idx + 2]*YtfR[lineidx] + 2*u_curr[shift_idx + 3]*(-YtfI[lineidx]) + tmp14h_2 = 2*u_curr[shift_idx + 2]*(-YtfI[lineidx]) + 2*u_curr[shift_idx + 3]*(-YtfR[lineidx]) + + trg[1] = Atron[1,1] * x[1] + Atron[1,2]*x[2] + Atron[1,3]*x[3] + Atron[1,4]*x[4] + + Atron[1,5]*x[5] + Atron[1,6]*x[6] + btron[1] + + trg[2] = Atron[2,1] * x[1] + Atron[2,2]*x[2] + Atron[2,3]*x[3] + Atron[2,4]*x[4] + + Atron[2,5]*x[5] + Atron[2,6]*x[6] + btron[2] + + lambda[3,lineidx] = -abs(trg[1]) + lambda[4,lineidx] = -abs(trg[2]) + + rhs_1 = trg[1]*tmp14i_1 + trg[2]*tmp14h_1 + Hbr[1,1]*sqp_line[1,lineidx] + Hbr[1,2]*sqp_line[2,lineidx] + + Hbr[1,3]*sqp_line[3,lineidx] + Hbr[1,4]*sqp_line[4,lineidx] + Hbr[1,5]*sqp_line[5,lineidx] + + Hbr[1,6]*sqp_line[6,lineidx] + bbr[1] + + rhs_2 = trg[1]*tmp14i_2 + trg[2]*tmp14h_2 + Hbr[2,1]*sqp_line[1,lineidx] + Hbr[2,2]*sqp_line[2,lineidx] + + Hbr[2,3]*sqp_line[3,lineidx] + Hbr[2,4]*sqp_line[4,lineidx] + 
Hbr[2,5]*sqp_line[5,lineidx] + + Hbr[2,6]*sqp_line[6,lineidx] + bbr[2] + + lambda[1,lineidx] = -tmpH11*rhs_1 - tmpH12*rhs_2 + lambda[2,lineidx] = -tmpH21*rhs_1 - tmpH22*rhs_2 + + CUDA.sync_threads() + return +end diff --git a/src/models/qpsub/qpsub_auglag_tron_linelimit_kernel_cpu.jl b/src/models/qpsub/qpsub_auglag_tron_linelimit_kernel_cpu.jl new file mode 100644 index 0000000..f39ae96 --- /dev/null +++ b/src/models/qpsub/qpsub_auglag_tron_linelimit_kernel_cpu.jl @@ -0,0 +1,72 @@ +""" + build_QP_*() + +- build any box-constrained QP with Exatron.createproblem() +- implement eval_f, eval_g, eval_h callback functions +""" + +function build_QP_SP(A::Matrix{Float64}, b::Array{Float64, 1}, l::Array{Float64, 1}, u::Array{Float64, 1}) + n=length(b) + P = sparse(A) #make A sparse to use nnz() and findnz() + q = b + Iz, Jz, vals = findnz(P) #does not see symmetric record n*n + + eval_f(x) = 0.5 * dot(x, P, x) + dot(q, x) #obj watch out for 1/2 + + function eval_g(x, g) + fill!(g, 0) + mul!(g, P, x) + g .+= q + end + + #eval_h store all n*n entries + function eval_h(x, mode, rows, cols, obj_factor, lambda, values) + if mode == :Structure + for i in 1:nnz(P) #does not symmetric + rows[i] = Iz[i] + cols[i] = Jz[i] + end + else + copy!(values, vals) + end + end + + return ExaTron.createProblem(n, l, u, nnz(P), eval_f, eval_g, eval_h) #original without param +end + + +function build_QP_DS(A::Matrix{Float64}, b::Array{Float64, 1}, l::Array{Float64, 1}, u::Array{Float64, 1}) + n=length(b) + + + eval_f(x) = 0.5 * dot(x, A, x) + dot(b, x) #obj watch out for 1/2 + + function eval_g(x, g) + fill!(g, 0) + mul!(g, A, x) + g .+= b + end + + function eval_h(x, mode, rows, cols, obj_factor, lambda, values) + if mode == :Structure + nz = 1 + for j=1:n + for i in j:n + rows[nz] = i + cols[nz] = j + nz += 1 + end + end + else + nz = 1 + for j=1:n + for i in j:n + values[nz] = A[i, j] + nz += 1 + end + end + end + end + #with Youngdae's Param + return ExaTron.createProblem(n, l, u, Int64((n+1)*n/2), eval_f, eval_g, eval_h; :tol=> 1e-6, :matrix_type => :Dense, :max_minor => 200, :frtol => 1e-12) +end diff --git a/src/models/qpsub/qpsub_eval_Ab_linelimit_kernel_cpu.jl b/src/models/qpsub/qpsub_eval_Ab_linelimit_kernel_cpu.jl new file mode 100644 index 0000000..22b5e34 --- /dev/null +++ b/src/models/qpsub/qpsub_eval_Ab_linelimit_kernel_cpu.jl @@ -0,0 +1,254 @@ +""" + eval_A_*(), eval_b_*() + +- prepare call backs for build_QP_DS and IPOPT benchmark (solve branch kernel directly) +- use mod.membuf +""" + + +""" + Internal Solution Structure for branch + +- branch structure from u (8*nline): + - |p_ij | q_ij | p_ji | q_ji | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji) | + +- branch structure for Exatron (8*nline): + - | t_ij(linelimit) | t_ji(linelimit) | w_ijR | w_ijI | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji) + +- branch structure for Exatron (6*nline): eliminate w_ijR, wij_I + - | t_ij(linelimit) | t_ji(linelimit) | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji) + +- Hessian inherited from SQP (6*nline): + - |w_ijR | w_ijI | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji)| +""" + + + +function eval_A_branch_kernel_cpu_qpsub( + H::Array{Float64,2},l::Array{Float64,1}, rho::Array{Float64,1}, v::Array{Float64,1}, z_curr::Array{Float64,1}, + YffR::Float64, YffI::Float64, + YftR::Float64, YftI::Float64, + YttR::Float64, YttI::Float64, + YtfR::Float64, YtfI::Float64) + + + + #linear transform pij qij pji qji wrt Hessian inherited structure + supY = [YftR YftI YffR 0 0 0; + -YftI YftR -YffI 0 0 0; + YtfR -YtfI 0 YttR 0 0; + -YtfI -YtfR 0 
-YttI 0 0] + + H_new = H + @inbounds begin + + H_new .+= rho[1]*supY[1,:]*transpose(supY[1,:]) #pij + H_new .+= rho[2]*supY[2,:]*transpose(supY[2,:]) #qij + H_new .+= rho[3]*supY[3,:]*transpose(supY[3,:]) #pji + H_new .+= rho[4]*supY[4,:]*transpose(supY[4,:]) #qji + + H_new[3,3] += rho[5] #wi(ij) + H_new[4,4] += rho[6] #wj(ji) + H_new[5,5] += rho[7] #thetai(ij) + H_new[6,6] += rho[8] #thetaj(ji) + end + + return H #6*6 #not scale +end + +function eval_A_auglag_branch_kernel_cpu_qpsub( + Hbr::Array{Float64,2},l::Array{Float64,1}, rho::Array{Float64,1}, + v::Array{Float64,1}, z_curr::Array{Float64,1}, membuf::Array{Float64,1}, + YffR::Float64, YffI::Float64, + YftR::Float64, YftI::Float64, + YttR::Float64, YttI::Float64, + YtfR::Float64, YtfI::Float64, + LH_1h::Array{Float64,1}, RH_1h::Float64, + LH_1i::Array{Float64,1}, RH_1i::Float64, LH_1j::Array{Float64,1},RH_1j::Float64, LH_1k::Array{Float64,1},RH_1k::Float64,scale::Float64) + + A = zeros(8,8) + A[3:8,3:8] = Hbr + + + #pij qij pji qji wrt branch structure ExaTron + supY = [0 0 YftR YftI YffR 0 0 0; + 0 0 -YftI YftR -YffI 0 0 0; + 0 0 YtfR -YtfI 0 YttR 0 0; + 0 0 -YtfI -YtfR 0 -YttI 0 0] + + #ALM on equality constraint wrt branch structure ExaTron + vec_1h = [0, 0, LH_1h[1], LH_1h[2], LH_1h[3], LH_1h[4], 0, 0 ] #1h + vec_1i = [0, 0, LH_1i[1], LH_1i[2], 0, 0, LH_1i[3], LH_1i[4]] #1i + vec_1j = [1, 0, 0, 0, 0, 0, 0, 0] + LH_1j[1]* supY[1,:] + LH_1j[2]* supY[2,:] #1j with t_ij + vec_1k = [0, 1, 0, 0, 0, 0, 0, 0] + LH_1k[1]* supY[3,:] + LH_1k[2]* supY[4,:] #1k with t_ji + + @inbounds begin + A .+= membuf[5]*vec_1h*transpose(vec_1h) #add auglag for 1h + + A .+= membuf[5]*vec_1i*transpose(vec_1i) #add auglag for 1i + + A .+= membuf[5]*vec_1j*transpose(vec_1j) #add auglag for 1j + + A .+= membuf[5]*vec_1k*transpose(vec_1k) #add auglag for 1k + end + + return A*scale #dim = 8*8 scaled for non-red +end + +function eval_A_auglag_branch_kernel_cpu_qpsub_red( + Hbr::Array{Float64,2},l::Array{Float64,1}, rho::Array{Float64,1}, + v::Array{Float64,1}, z_curr::Array{Float64,1}, membuf::Array{Float64,1}, + YffR::Float64, YffI::Float64, + YftR::Float64, YftI::Float64, + YttR::Float64, YttI::Float64, + YtfR::Float64, YtfI::Float64, + LH_1h::Array{Float64,1}, RH_1h::Float64, + LH_1i::Array{Float64,1}, RH_1i::Float64, LH_1j::Array{Float64,1},RH_1j::Float64, LH_1k::Array{Float64,1},RH_1k::Float64,scale::Float64) + + A = zeros(8,8) + A[3:8,3:8] = Hbr + + + #pij qij pji qji wrt branch structure ExaTron + supY = [0 0 YftR YftI YffR 0 0 0; + 0 0 -YftI YftR -YffI 0 0 0; + 0 0 YtfR -YtfI 0 YttR 0 0; + 0 0 -YtfI -YtfR 0 -YttI 0 0] + + #ALM on equality constraint wrt branch structure ExaTron + vec_1j = [1, 0, 0, 0, 0, 0, 0, 0] + LH_1j[1]* supY[1,:] + LH_1j[2]* supY[2,:] #1j with t_ij + vec_1k = [0, 1, 0, 0, 0, 0, 0, 0] + LH_1k[1]* supY[3,:] + LH_1k[2]* supY[4,:] #1k with t_ji + + @inbounds begin + A .+= membuf[5]*vec_1j*transpose(vec_1j) #add auglag for 1j + + A .+= membuf[5]*vec_1k*transpose(vec_1k) #add auglag for 1k + end + + + + inv_ij = inv([LH_1h[1] LH_1h[2]; LH_1i[1] LH_1i[2]]) + C_ij = -inv_ij * [0 0 LH_1h[3] LH_1h[4] 0 0; 0 0 0 0 LH_1i[3] LH_1i[4]] + C = zeros(8,6) + C[1,1] = C[2,2] = 1 + C[3:4,:] .= C_ij + C[5,3] = 1 + C[6,4] = 1 + C[7,5] = 1 + C[8,6] = 1 + + return A, transpose(C)*A*C*scale #dim 8*8 no scale + dim = 6*6 scaled for red +end + +function eval_b_branch_kernel_cpu_qpsub( + l::Array{Float64,1}, rho::Array{Float64,1}, v::Array{Float64,1}, z::Array{Float64,1}, + YffR::Float64, YffI::Float64, + YftR::Float64, YftI::Float64, + YttR::Float64, 
YttI::Float64, + YtfR::Float64, YtfI::Float64, res::Array{Float64,1}) + + + supY = [YftR YftI YffR 0 0 0; + -YftI YftR -YffI 0 0 0; + YtfR -YtfI 0 YttR 0 0; + -YtfI -YtfR 0 -YttI 0 0] + + b = zeros(6) + + @inbounds begin + b .+= (l[1] - rho[1]*(v[1]-z[1]-res[1])) * supY[1,:] #pij + b .+= (l[2] - rho[2]*(v[2]-z[2]-res[2])) * supY[2,:] #qij + b .+= (l[3] - rho[3]*(v[3]-z[3]-res[3])) * supY[3,:] #pji + b .+= (l[4] - rho[4]*(v[4]-z[4]-res[4])) * supY[4,:] #qji + b[3] += (l[5] - rho[5]*(v[5]-z[5])) #wi(ij) + b[4] += (l[6] - rho[6]*(v[6]-z[6])) #wj(ji) + b[5] += (l[7] - rho[7]*(v[7]-z[7])) #thetai(ij) + b[6] += (l[8] - rho[8]*(v[8]-z[8])) #thetaj(ji) + end + + return b #size 6 noscale +end + +function eval_b_auglag_branch_kernel_cpu_qpsub( + bbr::Array{Float64,1}, l::Array{Float64,1}, rho::Array{Float64,1}, + v::Array{Float64,1}, z_curr::Array{Float64,1}, membuf::Array{Float64,1}, + YffR::Float64, YffI::Float64, + YftR::Float64, YftI::Float64, + YttR::Float64, YttI::Float64, + YtfR::Float64, YtfI::Float64, LH_1h::Array{Float64,1}, RH_1h::Float64, + LH_1i::Array{Float64,1}, RH_1i::Float64, LH_1j::Array{Float64,1},RH_1j::Float64, LH_1k::Array{Float64,1},RH_1k::Float64, scale::Float64) + + b = zeros(8) + b[3:8] = bbr + + + #pij qij pji qji wrt branch structure ExaTron + supY = [0 0 YftR YftI YffR 0 0 0; + 0 0 -YftI YftR -YffI 0 0 0; + 0 0 YtfR -YtfI 0 YttR 0 0; + 0 0 -YtfI -YtfR 0 -YttI 0 0] + + #ALM on equality constraint wrt branch structure ExaTron + vec_1h = [0, 0, LH_1h[1], LH_1h[2], LH_1h[3], LH_1h[4], 0, 0 ] #1h + vec_1i = [0, 0, LH_1i[1], LH_1i[2], 0, 0, LH_1i[3], LH_1i[4]] #1i + vec_1j = [1, 0, 0, 0, 0, 0, 0, 0] + LH_1j[1]* supY[1,:] + LH_1j[2]* supY[2,:] #1j with t_ij + vec_1k = [0, 1, 0, 0, 0, 0, 0, 0] + LH_1k[1]* supY[3,:] + LH_1k[2]* supY[4,:] #1k with t_ji + + @inbounds begin + b .+= (membuf[1] - membuf[5]*RH_1h)*vec_1h #1h + b .+= (membuf[2] - membuf[5]*RH_1i)*vec_1i #1i + b .+= (membuf[3] - membuf[5]*RH_1j)*vec_1j #1j + b .+= (membuf[4] - membuf[5]*RH_1k)*vec_1k #1k + end + + return b*scale #dim = 8 scaled +end + +function eval_b_auglag_branch_kernel_cpu_qpsub_red( + A_aug::Array{Float64,2}, bbr::Array{Float64,1}, l::Array{Float64,1}, rho::Array{Float64,1}, + v::Array{Float64,1}, z_curr::Array{Float64,1}, membuf::Array{Float64,1}, + YffR::Float64, YffI::Float64, + YftR::Float64, YftI::Float64, + YttR::Float64, YttI::Float64, + YtfR::Float64, YtfI::Float64, LH_1h::Array{Float64,1}, RH_1h::Float64, + LH_1i::Array{Float64,1}, RH_1i::Float64, LH_1j::Array{Float64,1},RH_1j::Float64, LH_1k::Array{Float64,1},RH_1k::Float64, scale::Float64) + + b = zeros(8) + b[3:8] = bbr + + + #pij qij pji qji wrt branch structure ExaTron + supY = [0 0 YftR YftI YffR 0 0 0; + 0 0 -YftI YftR -YffI 0 0 0; + 0 0 YtfR -YtfI 0 YttR 0 0; + 0 0 -YtfI -YtfR 0 -YttI 0 0] + + #ALM on equality constraint wrt branch structure ExaTron + vec_1j = [1, 0, 0, 0, 0, 0, 0, 0] + LH_1j[1]* supY[1,:] + LH_1j[2]* supY[2,:] #1j with t_ij + vec_1k = [0, 1, 0, 0, 0, 0, 0, 0] + LH_1k[1]* supY[3,:] + LH_1k[2]* supY[4,:] #1k with t_ji + + @inbounds begin + b .+= (membuf[3] - membuf[5]*RH_1j)*vec_1j #1j + b .+= (membuf[4] - membuf[5]*RH_1k)*vec_1k #1k + end + + inv_ij = inv([LH_1h[1] LH_1h[2]; LH_1i[1] LH_1i[2]]) + C_ij = -inv_ij * [0 0 LH_1h[3] LH_1h[4] 0 0; 0 0 0 0 LH_1i[3] LH_1i[4]] + d_ij = inv_ij * [RH_1h; RH_1i] + C = zeros(8,6) + C[1,1] = C[2,2] = 1 + C[3:4,:] .= C_ij + C[5,3] = 1 + C[6,4] = 1 + C[7,5] = 1 + C[8,6] = 1 + + d=zeros(8) + d[3:4] .= d_ij + + + + + + return C, d, transpose(C) * (A_aug * d + b)*scale #dims = 8*6, 8, 
6, coeff, coeff, scaled +end diff --git a/src/models/qpsub/qpsub_eval_Ab_linelimit_kernel_gpu.jl b/src/models/qpsub/qpsub_eval_Ab_linelimit_kernel_gpu.jl new file mode 100644 index 0000000..9b7dba1 --- /dev/null +++ b/src/models/qpsub/qpsub_eval_Ab_linelimit_kernel_gpu.jl @@ -0,0 +1,180 @@ +""" + Internal Solution Structure for branch + +- branch structure from u (8*nline): + - |p_ij | q_ij | p_ji | q_ji | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji) | + +- branch structure for Exatron (8*nline): + - | t_ij(linelimit) | t_ji(linelimit) | w_ijR | w_ijI | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji) + +- branch structure for Exatron (6*nline): eliminate w_ijR, wij_I + - | t_ij(linelimit) | t_ji(linelimit) | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji) + +- Hessian inherited from SQP (6*nline): + - |w_ijR | w_ijI | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji)| +""" + + +""" + eval_A_b_branch_kernel_gpu_qpsub() + +- prepare QP parameter of ADMM branch kernel (before ALM) +""" + + +function eval_A_b_branch_kernel_gpu_qpsub( + H, l, rho, v, z, A_ipopt, b_ipopt, id_line, shift_idx, supY, line_res) + tx = threadIdx().x + if tx == 1 + @inbounds begin + for i = 1:6 + for j = 1:6 + A_ipopt[i, j] = H[6*(id_line - 1) + i, j] + + rho[shift_idx]*supY[4*(id_line - 1) + 1,i+2]*supY[4*(id_line - 1) + 1,j+2] + + rho[shift_idx + 1]*supY[4*(id_line - 1) + 2,i+2]*supY[4*(id_line - 1) + 2,j+2] + + rho[shift_idx + 2]*supY[4*(id_line - 1) + 3,i+2]*supY[4*(id_line - 1) + 3,j+2] + + rho[shift_idx + 3]*supY[4*(id_line - 1) + 4,i+2]*supY[4*(id_line - 1) + 4,j+2] + end + end + + for j = 1:6 + b_ipopt[j] = (l[shift_idx] - rho[shift_idx]*(v[shift_idx] - z[shift_idx] - line_res[1, id_line]))*supY[4*(id_line - 1) + 1,j+2] + + (l[shift_idx+1] - rho[shift_idx+1]*(v[shift_idx+1] - z[shift_idx+1] - line_res[2, id_line]))*supY[4*(id_line - 1) + 2,j+2] + + (l[shift_idx+2] - rho[shift_idx+2]*(v[shift_idx+2] - z[shift_idx+2] - line_res[3, id_line]))*supY[4*(id_line - 1) + 3,j+2] + + (l[shift_idx+3] - rho[shift_idx+3]*(v[shift_idx+3] - z[shift_idx+3] - line_res[4, id_line]))*supY[4*(id_line - 1) + 4,j+2] + end + + b_ipopt[3] += l[shift_idx+4] - rho[shift_idx+4]*(v[shift_idx+4] - z[shift_idx+4]) + b_ipopt[4] += l[shift_idx+5] - rho[shift_idx+5]*(v[shift_idx+5] - z[shift_idx+5]) + b_ipopt[5] += l[shift_idx+6] - rho[shift_idx+6]*(v[shift_idx+6] - z[shift_idx+6]) + b_ipopt[6] += l[shift_idx+7] - rho[shift_idx+7]*(v[shift_idx+7] - z[shift_idx+7]) + + A_ipopt[3,3] += rho[shift_idx + 4] #wi(ij) + A_ipopt[4,4] += rho[shift_idx + 5] #wj(ji) + A_ipopt[5,5] += rho[shift_idx + 6] #thetai(ij) + A_ipopt[6,6] += rho[shift_idx + 7] #thetaj(ji) + end #@inbounds + end + + CUDA.sync_threads() + return #6*6 #not scale +end + + + +""" + eval_A_b_auglag_branch_kernel_gpu_qpsub_red() + +- prepare QP parameter of ADMM branch kernel (after model reduction and ALM) +- Input for Tron GPU +- use mod.membuf +""" + +function eval_A_b_auglag_branch_kernel_gpu_qpsub_red(Hbr, bbr, A_aug, Atron, btron, scale,vec_1j,vec_1k,membuf,lineidx, Ctron,dtron,RH_1j,RH_1k) + tx = threadIdx().x + if tx == 1 + fill!(A_aug, 0.0) + fill!(Atron, 0.0) + + @inbounds begin + for i = 3:8 + for j = 3:8 + A_aug[i,j] = Hbr[i-2,j-2] + end + end + + + + for i=1:8 + for j=1:8 + A_aug[i,j] += membuf[5,lineidx]*vec_1j[i]*vec_1j[j] + A_aug[i,j] += membuf[5,lineidx]*vec_1k[i]*vec_1k[j] + end + end + + + for i=1:6 + for j=1:6 + c1 = A_aug[1,1]*Ctron[1,j] + A_aug[1,2]*Ctron[2,j] + A_aug[1,3]*Ctron[3,j] + A_aug[1,4]*Ctron[4,j] + + A_aug[1,5]*Ctron[5,j] + A_aug[1,6]*Ctron[6,j] + A_aug[1,7]*Ctron[7,j] + 
A_aug[1,8]*Ctron[8,j] + + c2 = A_aug[2,1]*Ctron[1,j] + A_aug[2,2]*Ctron[2,j] + A_aug[2,3]*Ctron[3,j] + A_aug[2,4]*Ctron[4,j] + + A_aug[2,5]*Ctron[5,j] + A_aug[2,6]*Ctron[6,j] + A_aug[2,7]*Ctron[7,j] + A_aug[2,8]*Ctron[8,j] + + c3 = A_aug[3,1]*Ctron[1,j] + A_aug[3,2]*Ctron[2,j] + A_aug[3,3]*Ctron[3,j] + A_aug[3,4]*Ctron[4,j] + + A_aug[3,5]*Ctron[5,j] + A_aug[3,6]*Ctron[6,j] + A_aug[3,7]*Ctron[7,j] + A_aug[3,8]*Ctron[8,j] + + c4 = A_aug[4,1]*Ctron[1,j] + A_aug[4,2]*Ctron[2,j] + A_aug[4,3]*Ctron[3,j] + A_aug[4,4]*Ctron[4,j] + + A_aug[4,5]*Ctron[5,j] + A_aug[4,6]*Ctron[6,j] + A_aug[4,7]*Ctron[7,j] + A_aug[4,8]*Ctron[8,j] + + c5 = A_aug[5,1]*Ctron[1,j] + A_aug[5,2]*Ctron[2,j] + A_aug[5,3]*Ctron[3,j] + A_aug[5,4]*Ctron[4,j] + + A_aug[5,5]*Ctron[5,j] + A_aug[5,6]*Ctron[6,j] + A_aug[5,7]*Ctron[7,j] + A_aug[5,8]*Ctron[8,j] + + c6 = A_aug[6,1]*Ctron[1,j] + A_aug[6,2]*Ctron[2,j] + A_aug[6,3]*Ctron[3,j] + A_aug[6,4]*Ctron[4,j] + + A_aug[6,5]*Ctron[5,j] + A_aug[6,6]*Ctron[6,j] + A_aug[6,7]*Ctron[7,j] + A_aug[6,8]*Ctron[8,j] + + c7 = A_aug[7,1]*Ctron[1,j] + A_aug[7,2]*Ctron[2,j] + A_aug[7,3]*Ctron[3,j] + A_aug[7,4]*Ctron[4,j] + + A_aug[7,5]*Ctron[5,j] + A_aug[7,6]*Ctron[6,j] + A_aug[7,7]*Ctron[7,j] + A_aug[7,8]*Ctron[8,j] + + c8 = A_aug[8,1]*Ctron[1,j] + A_aug[8,2]*Ctron[2,j] + A_aug[8,3]*Ctron[3,j] + A_aug[8,4]*Ctron[4,j] + + A_aug[8,5]*Ctron[5,j] + A_aug[8,6]*Ctron[6,j] + A_aug[8,7]*Ctron[7,j] + A_aug[8,8]*Ctron[8,j] + + Atron[i,j] = scale*(Ctron[1,i]*c1 + Ctron[2,i]*c2 + Ctron[3,i]*c3 + Ctron[4,i]*c4 + Ctron[5,i]*c5 + Ctron[6,i]*c6 + + Ctron[7,i]*c7 + Ctron[8,i]*c8) + + end + end + + b1 = A_aug[1,1]*dtron[1] + A_aug[1,2]*dtron[2] + A_aug[1,3]*dtron[3] + A_aug[1,4]*dtron[4] + + A_aug[1,5]*dtron[5] + A_aug[1,6]*dtron[6] + A_aug[1,7]*dtron[7] + A_aug[1,8]*dtron[8] + + (membuf[3,lineidx] - membuf[5,lineidx]*RH_1j[lineidx])*vec_1j[1] + + (membuf[4,lineidx] - membuf[5,lineidx]*RH_1k[lineidx])*vec_1k[1] + + b2 = A_aug[2,1]*dtron[1] + A_aug[2,2]*dtron[2] + A_aug[2,3]*dtron[3] + A_aug[2,4]*dtron[4] + + A_aug[2,5]*dtron[5] + A_aug[2,6]*dtron[6] + A_aug[2,7]*dtron[7] + A_aug[2,8]*dtron[8] + + (membuf[3,lineidx] - membuf[5,lineidx]*RH_1j[lineidx])*vec_1j[2] + + (membuf[4,lineidx] - membuf[5,lineidx]*RH_1k[lineidx])*vec_1k[2] + + b3 = A_aug[3,1]*dtron[1] + A_aug[3,2]*dtron[2] + A_aug[3,3]*dtron[3] + A_aug[3,4]*dtron[4] + + A_aug[3,5]*dtron[5] + A_aug[3,6]*dtron[6] + A_aug[3,7]*dtron[7] + A_aug[3,8]*dtron[8] + + (membuf[3,lineidx] - membuf[5,lineidx]*RH_1j[lineidx])*vec_1j[3] + + (membuf[4,lineidx] - membuf[5,lineidx]*RH_1k[lineidx])*vec_1k[3] + bbr[1] + + b4 = A_aug[4,1]*dtron[1] + A_aug[4,2]*dtron[2] + A_aug[4,3]*dtron[3] + A_aug[4,4]*dtron[4] + + A_aug[4,5]*dtron[5] + A_aug[4,6]*dtron[6] + A_aug[4,7]*dtron[7] + A_aug[4,8]*dtron[8] + + (membuf[3,lineidx] - membuf[5,lineidx]*RH_1j[lineidx])*vec_1j[4] + + (membuf[4,lineidx] - membuf[5,lineidx]*RH_1k[lineidx])*vec_1k[4] + bbr[2] + + b5 = A_aug[5,1]*dtron[1] + A_aug[5,2]*dtron[2] + A_aug[5,3]*dtron[3] + A_aug[5,4]*dtron[4] + + A_aug[5,5]*dtron[5] + A_aug[5,6]*dtron[6] + A_aug[5,7]*dtron[7] + A_aug[5,8]*dtron[8] + + (membuf[3,lineidx] - membuf[5,lineidx]*RH_1j[lineidx])*vec_1j[5] + + (membuf[4,lineidx] - membuf[5,lineidx]*RH_1k[lineidx])*vec_1k[5] +bbr[3] + + b6 = A_aug[6,1]*dtron[1] + A_aug[6,2]*dtron[2] + A_aug[6,3]*dtron[3] + A_aug[6,4]*dtron[4] + + A_aug[6,5]*dtron[5] + A_aug[6,6]*dtron[6] + A_aug[6,7]*dtron[7] + A_aug[6,8]*dtron[8] + + (membuf[3,lineidx] - membuf[5,lineidx]*RH_1j[lineidx])*vec_1j[6] + + (membuf[4,lineidx] - 
membuf[5,lineidx]*RH_1k[lineidx])*vec_1k[6] + bbr[4] + + b7 = A_aug[7,1]*dtron[1] + A_aug[7,2]*dtron[2] + A_aug[7,3]*dtron[3] + A_aug[7,4]*dtron[4] + + A_aug[7,5]*dtron[5] + A_aug[7,6]*dtron[6] + A_aug[7,7]*dtron[7] + A_aug[7,8]*dtron[8] + + (membuf[3,lineidx] - membuf[5,lineidx]*RH_1j[lineidx])*vec_1j[7] + + (membuf[4,lineidx] - membuf[5,lineidx]*RH_1k[lineidx])*vec_1k[7] + bbr[5] + + b8 = A_aug[8,1]*dtron[1] + A_aug[8,2]*dtron[2] + A_aug[8,3]*dtron[3] + A_aug[8,4]*dtron[4] + + A_aug[8,5]*dtron[5] + A_aug[8,6]*dtron[6] + A_aug[8,7]*dtron[7] + A_aug[8,8]*dtron[8] + + (membuf[3,lineidx] - membuf[5,lineidx]*RH_1j[lineidx])*vec_1j[8] + + (membuf[4,lineidx] - membuf[5,lineidx]*RH_1k[lineidx])*vec_1k[8] + bbr[6] + + + for i = 1:6 + btron[i] = scale*(Ctron[1,i]*b1 + Ctron[2,i]*b2 + Ctron[3,i]*b3 + Ctron[4,i]*b4 + Ctron[5,i]*b5 + Ctron[6,i]*b6 + + Ctron[7,i]*b7 + Ctron[8,i]*b8) + end + + + + end #inbounds + end #tx + CUDA.sync_threads() + return +end diff --git a/src/models/qpsub/qpsub_generator_kernel_cpu.jl b/src/models/qpsub/qpsub_generator_kernel_cpu.jl new file mode 100644 index 0000000..8fdc354 --- /dev/null +++ b/src/models/qpsub/qpsub_generator_kernel_cpu.jl @@ -0,0 +1,15 @@ +""" + generator_kernel_two_level_qpsub() + +record cpu time: return tcpu +""" + +function generator_kernel_two_level( + model::ModelQpsub{Float64,Array{Float64,1},Array{Int,1}}, + baseMVA::Float64, u, xbar, zu, lu, rho_u +) +tcpu = @timed generator_kernel_two_level(baseMVA, model.grid_data.ngen, model.gen_start, +u, xbar, zu, lu, rho_u, model.qpsub_pgmin, model.qpsub_pgmax, model.qpsub_qgmin, model.qpsub_qgmax, +model.qpsub_c2, model.qpsub_c1) +return tcpu +end diff --git a/src/models/qpsub/qpsub_generator_kernel_gpu.jl b/src/models/qpsub/qpsub_generator_kernel_gpu.jl new file mode 100644 index 0000000..c9b060f --- /dev/null +++ b/src/models/qpsub/qpsub_generator_kernel_gpu.jl @@ -0,0 +1,16 @@ +""" + generator_kernel_two_level_qpsub() + +record cpu time: return tcpu +""" + +function generator_kernel_two_level( + model::ModelQpsub{Float64,CuArray{Float64,1},CuArray{Int,1}}, + baseMVA::Float64, u, xbar, zu, lu, rho_u + ) + nblk = div(model.grid_data.ngen, 32, RoundUp) + tgpu = CUDA.@timed @cuda threads=32 blocks=nblk generator_kernel_two_level(baseMVA, model.grid_data.ngen, model.gen_start, + u, xbar, zu, lu, rho_u, model.qpsub_pgmin, model.qpsub_pgmax, model.qpsub_qgmin, model.qpsub_qgmax, + model.qpsub_c2, model.qpsub_c1) + return tgpu +end diff --git a/src/models/qpsub/qpsub_init_solution_cpu.jl b/src/models/qpsub/qpsub_init_solution_cpu.jl new file mode 100644 index 0000000..898c97c --- /dev/null +++ b/src/models/qpsub/qpsub_init_solution_cpu.jl @@ -0,0 +1,66 @@ +""" + init_solution() + +- initialize sol.v_curr and sol.rho for all coupling +- initialize sqp_line, supY +""" + +function init_solution!( + model::ModelQpsub{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}}, + sol::Solution{Float64,Array{Float64,1}}, + rho_pq::Float64, rho_va::Float64 + ) + + ngen = model.grid_data.ngen + nline = model.grid_data.nline + nbus = model.grid_data.nbus + + sqp_line = model.sqp_line + ls = model.ls + us = model.us + + brBusIdx = model.grid_data.brBusIdx + Vmax = model.grid_data.Vmax; Vmin = model.grid_data.Vmin + YffR = model.grid_data.YffR; YffI = model.grid_data.YffI + YttR = model.grid_data.YttR; YttI = model.grid_data.YttI + YftR = model.grid_data.YftR; YftI = model.grid_data.YftI + YtfR = model.grid_data.YtfR; YtfI = model.grid_data.YtfI + + fill!(sol, 0.0) + fill!(model.lambda, 0.0) + + #qpsub var + sol.rho .= 
rho_pq + + for g=1:ngen + pg_idx = model.gen_start + 2*(g-1) + sol.v_curr[pg_idx] = 0.5*(model.qpsub_pgmin[g] + model.qpsub_pgmax[g]) + sol.v_curr[pg_idx+1] = 0.5*(model.qpsub_qgmin[g] + model.qpsub_qgmax[g]) + end + + for l=1:nline + + pij_idx = model.line_start + 8*(l-1) + + supY = [YftR[l] YftI[l] YffR[l] 0 0 0; + -YftI[l] YftR[l] -YffI[l] 0 0 0; + YtfR[l] -YtfI[l] 0 YttR[l] 0 0; + -YtfI[l] -YtfR[l] 0 -YttI[l] 0 0] #wijR, wijI, wi, wj, theta_i, theta_j + + #initialize sqp_line + sqp_line[:,l] = (ls[l,:] + us[l,:])/2 # order |w_ijR | w_ijI | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji)| + + sol.v_curr[pij_idx] = dot(supY[1,:],sqp_line[:,l]) #p_ij + sol.v_curr[pij_idx+1] = dot(supY[2,:],sqp_line[:,l]) #q_ij + sol.v_curr[pij_idx+2] = dot(supY[3,:],sqp_line[:,l]) #p_ji + sol.v_curr[pij_idx+3] = dot(supY[4,:],sqp_line[:,l]) #q_ji + sol.v_curr[pij_idx+4] = sqp_line[3,l] #w_i + sol.v_curr[pij_idx+5] = sqp_line[4,l] #w_j + sol.v_curr[pij_idx+6] = sqp_line[5,l] #theta_i + sol.v_curr[pij_idx+7] = sqp_line[6,l] #theta_j + + sol.rho[pij_idx:pij_idx+7] .= rho_va + end + + return +end diff --git a/src/models/qpsub/qpsub_init_solution_gpu.jl b/src/models/qpsub/qpsub_init_solution_gpu.jl new file mode 100644 index 0000000..3104682 --- /dev/null +++ b/src/models/qpsub/qpsub_init_solution_gpu.jl @@ -0,0 +1,97 @@ +""" + init_solution() + +- initialize sol.v_curr and sol.rho for all coupling +- initialize sqp_line, supY +""" + +function init_generator_kernel_qpsub(n::Int, gen_start::Int, + pgmax::CuDeviceArray{Float64,1}, pgmin::CuDeviceArray{Float64,1}, + qgmax::CuDeviceArray{Float64,1}, qgmin::CuDeviceArray{Float64,1}, + v::CuDeviceArray{Float64,1} +) + g = threadIdx().x + (blockDim().x * (blockIdx().x - 1)) + if g <= n + v[gen_start + 2*(g-1)] = 0.5*(pgmin[g] + pgmax[g]) + v[gen_start + 2*(g-1)+1] = 0.5*(qgmin[g] + qgmax[g]) + end + + return +end + + +function init_branch_bus_kernel_qpsub(n::Int, line_start::Int, rho_va::Float64, + YffR::CuDeviceArray{Float64,1}, YffI::CuDeviceArray{Float64,1}, + YftR::CuDeviceArray{Float64,1}, YftI::CuDeviceArray{Float64,1}, + YtfR::CuDeviceArray{Float64,1}, YtfI::CuDeviceArray{Float64,1}, + YttR::CuDeviceArray{Float64,1}, YttI::CuDeviceArray{Float64,1}, + us::CuDeviceArray{Float64,2}, ls::CuDeviceArray{Float64,2}, sqp_line::CuDeviceArray{Float64,2}, + v::CuDeviceArray{Float64,1}, rho::CuDeviceArray{Float64,1}, supY::CuDeviceArray{Float64,2} +) + l = threadIdx().x + (blockDim().x * (blockIdx().x - 1)) + if l <= n + sqp_line[1,l] = (ls[l,1] + us[l,1])/2 # order |w_ijR | w_ijI | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji)| + sqp_line[2,l] = (ls[l,2] + us[l,2])/2 + sqp_line[3,l] = (ls[l,3] + us[l,3])/2 + sqp_line[4,l] = (ls[l,4] + us[l,4])/2 + sqp_line[5,l] = (ls[l,5] + us[l,5])/2 + sqp_line[6,l] = (ls[l,6] + us[l,6])/2 + + pij_idx = line_start + 8*(l-1) + + supY[4*(l-1) + 1,3] = YftR[l] + supY[4*(l-1) + 1,4] = YftI[l] + supY[4*(l-1) + 1,5] = YffR[l] + supY[4*(l-1) + 2,3] = -YftI[l] + supY[4*(l-1) + 2,4] = YftR[l] + supY[4*(l-1) + 2,5] = -YffI[l] + supY[4*(l-1) + 3,3] = YtfR[l] + supY[4*(l-1) + 3,4] = -YtfI[l] + supY[4*(l-1) + 3,6] = YttR[l] + supY[4*(l-1) + 4,3] = -YtfI[l] + supY[4*(l-1) + 4,4] = -YtfR[l] + supY[4*(l-1) + 4,6] = -YttI[l] + + + v[pij_idx] = YftR[l]*sqp_line[1,l] + YftI[l]*sqp_line[2,l] + YffR[l]*sqp_line[3,l] #CUBLAS.dot(4, supY[1,:],sqp_line[:,l]) #p_ij + v[pij_idx+1] = -YftI[l]*sqp_line[1,l] + YftR[l]*sqp_line[2,l] - YffI[l]*sqp_line[3,l] #CUBLAS.dot(4, supY[2,:],sqp_line[:,l]) #q_ij + v[pij_idx+2] = YtfR[l]*sqp_line[1,l] - YtfI[l]*sqp_line[2,l] + 
YttR[l]*sqp_line[4,l] #CUBLAS.dot(4, supY[3,:],sqp_line[:,l]) #p_ji + v[pij_idx+3] = -YtfI[l]*sqp_line[1,l] - YtfR[l]*sqp_line[2,l] - YttI[l]*sqp_line[4,l] #CUBLAS.dot(4, supY[4,:],sqp_line[:,l]) #q_ji + v[pij_idx+4] = sqp_line[3,l] #w_i + v[pij_idx+5] = sqp_line[4,l] #w_j + v[pij_idx+6] = sqp_line[5,l] #theta_i + v[pij_idx+7] = sqp_line[6,l] #theta_j + + rho[pij_idx:pij_idx+7] .= rho_va + + + end + + return +end + +function init_solution!( + model::ModelQpsub{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}}, + sol::Solution{Float64,CuArray{Float64,1}}, + rho_pq::Float64, rho_va::Float64 + ) + + data = model.grid_data + + fill!(sol, 0.0) + fill!(model.lambda, 0.0) + + #qpsub var + sol.rho .= rho_pq + + + @cuda threads=64 blocks=(div(data.ngen-1,64)+1) init_generator_kernel_qpsub(data.ngen, model.gen_start, + model.qpsub_pgmax, model.qpsub_pgmin, model.qpsub_qgmax, model.qpsub_qgmin, sol.v_curr) + + @cuda threads=64 blocks=(div(data.nline-1,64)+1) init_branch_bus_kernel_qpsub(data.nline, model.line_start, rho_va, + data.YffR, data.YffI, data.YftR, data.YftI, + data.YtfR, data.YtfI, data.YttR, data.YttI, model.us, model.ls, model.sqp_line, sol.v_curr, sol.rho, model.supY) + CUDA.synchronize() + + return +end diff --git a/src/models/qpsub/qpsub_model.jl b/src/models/qpsub/qpsub_model.jl new file mode 100644 index 0000000..9e85e86 --- /dev/null +++ b/src/models/qpsub/qpsub_model.jl @@ -0,0 +1,384 @@ +""" + Model{T,TD,TI,TM} + +This contains the parameters specific to QPSUB model instance. +""" + + +""" + Solution Structure: +- solution.u contain variables for generator and branch kernel +- solution.v contains variables for bus kernel +- Summary Table: + +| dimension | ngen | ngen | nline | nline | nline | nline | nline | nline | nline | nline | +|:--------------:|:-------:| :----: |:----: |:----: |:----: |:----: |:----: |:----: |:----: |:----: | +|structure for u | pg | qg | p_ij | q_ij | p_ji | q_ji | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji) | +|structure for v | pg(i) | qg(i) | p_ij(i) | q_ij(i)| p_ji(j) | q_ji(j) | wi | wj | thetai | thetaj | + +- structure for l and ρ is wrt all element of [x - xbar + z] with same dimension + +- structure for sqp_line: 6*nline + |w_ijR | w_ijI | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji)| + +- Note: line has shared nodes => xbar contain duplications. + For example line(1,2) and line(2,3): w2 and theta2 exist twice in v. + +- qpsub_membuf structure (dim = 5, used in auglag): + - |λ_1h | λ_1i | λ_1j| λ_1k | ρ_{1h,1i,1j,1k}| + - For c(x) = 0, ALM = λ*c(x) + (ρ/2)c(x)^2 + - For 1j and 1k, introduce slack t_ij and t_ji (see internal branch structure for Exatron) + +""" +mutable struct ModelQpsub{T,TD,TI,TM} <: AbstractOPFModel{T,TD,TI,TM} + info::IterationInformation + solution::AbstractSolution{T,TD} + + # Used for multiple dispatch for multi-period case. 
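+    # For the single-period qpsub model this field is only a placeholder; the constructor below sets it to EmptyGeneratorSolution.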
+ gen_solution::AbstractSolution{T,TD} + + n::Int + nvar::Int + + gen_start::Int + line_start::Int + + pgmin_curr::TD # taking ramping into account for rolling horizon + pgmax_curr::TD # taking ramping into account for rolling horizon + + grid_data::GridData{T,TD,TI,TM} + + membuf::TM # memory buffer for line kernel + gen_membuf::TM # memory buffer for generator kernel + + v_prev::TD + + + + #qpsub_construct + Hs::TM # Hessian information for all lines 6*nline x 6: |w_ijR | w_ijI | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji)| + + #QP coefficient for orignal branch kernel QP problem + LH_1h::TM # nline * 4 : w_ijR w_ijI wi(ij) wj(ji) + RH_1h::TD # nline + LH_1i::TM # nline * 4 : w_ijR w_ijI thetai(ij) thetaj(ji) + RH_1i::TD # nline + + LH_1j::TM #nline * 2 : pij qij + RH_1j::TD #nline + LH_1k::TM #nline * 2 : pji qji + RH_1k::TD #nline + + ls::TM # nline * 6 + us::TM # nline * 6 + + is_HS_sym::Array{Bool,1} #nline + is_HS_PSD::Array{Bool,1} #nline + + line_res::TM #4*nline + + qpsub_c1::TD #ngen + qpsub_c2::TD #ngen + + qpsub_pgmax::TD #ngen + qpsub_pgmin::TD #ngen + qpsub_qgmax::TD #ngen + qpsub_qgmin::TD #ngen + + qpsub_Pd::TD #nbus + qpsub_Qd::TD #nbus + + + + #qpsub_solve + qpsub_membuf::TM #memory buffer for qpsub 5*nline + sqp_line::TM #6 * nline + + #collect qpsub solution + dpg_sol::Array{Float64,1} #ngen + dqg_sol::Array{Float64,1} #ngen + + dline_var::Array{Float64,2} #6*nline: w_ijR, w_ijI, w_i, w_j, theta_i, theta_j + dline_fl::Array{Float64,2} #4*nline: p_ij, q_ij, p_ji, q_ji + + dtheta_sol::Array{Float64,1} #nbus consensus with line_var + dw_sol::Array{Float64,1} #nbus consensus with line_var + + # Two-Level ADMM + nvar_u::Int + nvar_v::Int + bus_start::Int # this is for varibles of type v. + + # Padded sizes for MPI + nline_padded::Int + nvar_u_padded::Int + nvar_padded::Int + + # for integration + dual_infeas::Array{Float64,1} #kkt error vector |pg |w_ijR | w_ijI | wi(ij) | wj(ji) | thetai(ij) | thetaj(ji)| + lambda::TM #14h i j k multiplier + + # additional memory allocation for branch kernel (GPU) + # NOTE: added by bowen + supY::TM #in gpu initialization + + + function ModelQpsub{T,TD,TI,TM}() where {T, TD<:AbstractArray{T}, TI<:AbstractArray{Int}, TM<:AbstractArray{T,2}} + return new{T,TD,TI,TM}() + end + + function ModelQpsub{T,TD,TI,TM}(env::AdmmEnv{T,TD,TI,TM}; ramp_ratio=0.02) where {T, TD<:AbstractArray{T}, TI<:AbstractArray{Int}, TM<:AbstractArray{T,2}} + model = new{T,TD,TI,TM}() + + model.grid_data = GridData{T,TD,TI,TM}(env) + + model.n = (env.use_linelimit == true) ? 6 : 4 # branch kernel size (use linelimt or not) + model.nline_padded = model.grid_data.nline + + # Memory space is padded for the lines as a multiple of # processes. 
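+    # Illustrative example (hypothetical numbers): with nprocs = 4 MPI ranks and nline = 9,
+    # nline_padded = 4 * div(9, 4, RoundUp) = 4 * 3 = 12, so every rank is assigned the same number of (possibly padded) lines.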
+ if env.use_mpi + nprocs = MPI.Comm_size(env.comm) + model.nline_padded = nprocs * div(model.grid_data.nline, nprocs, RoundUp) + end + + model.nvar = 2*model.grid_data.ngen + 8*model.grid_data.nline + model.nvar_padded = model.nvar + 8*(model.nline_padded - model.grid_data.nline) #useless + model.gen_start = 1 #location starting generator variables + model.line_start = 2*model.grid_data.ngen + 1 #location starting branch variables + + + model.pgmin_curr = TD(undef, model.grid_data.ngen) + model.pgmax_curr = TD(undef, model.grid_data.ngen) + copyto!(model.pgmin_curr, model.grid_data.pgmin) + copyto!(model.pgmax_curr, model.grid_data.pgmax) + + model.grid_data.ramp_rate = TD(undef, model.grid_data.ngen) + model.grid_data.ramp_rate .= ramp_ratio.*model.grid_data.pgmax + + #scale the obj params with obj_scale + if env.params.obj_scale != 1.0 + model.grid_data.c2 .*= env.params.obj_scale + model.grid_data.c1 .*= env.params.obj_scale + model.grid_data.c0 .*= env.params.obj_scale + model.Hs .*=env.params.obj_scale + end + + # These are only for two-level ADMM. + model.nvar_u = 2*model.grid_data.ngen + 8*model.grid_data.nline + model.nvar_u_padded = model.nvar_u + 8*(model.nline_padded - model.grid_data.nline) + model.nvar_v = 2*model.grid_data.ngen + 4*model.grid_data.nline + 2*model.grid_data.nbus + model.bus_start = 2*model.grid_data.ngen + 4*model.grid_data.nline + 1 + + # Memory space is allocated based on the padded size. + model.solution = Solution{T,TD}(model.nvar_padded) + + model.gen_solution = EmptyGeneratorSolution{T,TD}() + + #old memory buffer used in the auglag_linelimit with Tron + model.membuf = TM(undef, (31, model.grid_data.nline)) + fill!(model.membuf, 0.0) + model.membuf[29,:] .= model.grid_data.rateA + + model.info = IterationInformation{ComponentInformation}() + + #new solution structure for tron (Hessian inherited from SQP: 6*nline) + model.sqp_line = TM(undef, (6,model.grid_data.nline)) + fill!(model.sqp_line, 0.0) + + #new v_prev for dual reshape + model.v_prev = TD(undef, model.nvar) + fill!(model.v_prev, 0.0) + + #new memory buffer used in the new auglag_Ab with Tron + model.qpsub_membuf = TM(undef, (5,model.grid_data.nline)) + fill!(model.qpsub_membuf, 0.0) + + #new qpsub parameters + model.Hs = TM(undef,(6*model.grid_data.nline,6)) + fill!(model.Hs, 0.0) + + model.line_res = TM(undef,(4,model.grid_data.nline)) + fill!(model.line_res, 0.0) + + #1h + model.LH_1h = TM(undef,(model.grid_data.nline,4)) + fill!(model.LH_1h, 0.0) + + model.RH_1h = TD(undef,model.grid_data.nline) + fill!(model.RH_1h, 0.0) + + #1i + model.LH_1i = TM(undef,(model.grid_data.nline,4)) + fill!(model.LH_1i, 0.0) + + model.RH_1i = TD(undef,model.grid_data.nline) + fill!(model.RH_1i, 0.0) + + #1j + model.LH_1j = TM(undef,(model.grid_data.nline,2)) + fill!(model.LH_1j, 0.0) + + model.RH_1j = TD(undef,model.grid_data.nline) + fill!(model.RH_1j, 0.0) + + #1k + model.LH_1k = TM(undef,(model.grid_data.nline,2)) + fill!(model.LH_1k, 0.0) + + model.RH_1k = TD(undef,model.grid_data.nline) + fill!(model.RH_1k, 0.0) + + # l and u + model.ls = TM(undef,(model.grid_data.nline,6)) + fill!(model.ls, 0.0) + + model.us = TM(undef,(model.grid_data.nline,6)) + fill!(model.us, 0.0) + + model.is_HS_sym = Array{Bool,1}(undef, model.grid_data.nline) + fill!(model.is_HS_sym, true) + + model.is_HS_PSD = Array{Bool,1}(undef, model.grid_data.nline) + fill!(model.is_HS_PSD, true) + + model.qpsub_c1 = TD(undef, model.grid_data.ngen) + fill!(model.qpsub_c1, 0.0) + + model.qpsub_c2 = TD(undef, model.grid_data.ngen) + 
fill!(model.qpsub_c2, 0.0) + + model.qpsub_pgmax = TD(undef, model.grid_data.ngen) + fill!(model.qpsub_pgmax, 0.0) + + model.qpsub_pgmin = TD(undef, model.grid_data.ngen) + fill!(model.qpsub_pgmin, 0.0) + + model.qpsub_qgmax = TD(undef, model.grid_data.ngen) + fill!(model.qpsub_qgmax, 0.0) + + model.qpsub_qgmin = TD(undef, model.grid_data.ngen) + fill!(model.qpsub_qgmin, 0.0) + + model.qpsub_Pd = TD(undef, model.grid_data.nbus) + fill!(model.qpsub_Pd, 0.0) + + model.qpsub_Qd = TD(undef, model.grid_data.nbus) + fill!(model.qpsub_Qd, 0.0) + + # qpsub solution + model.dpg_sol = Array{Float64,1}(undef, model.grid_data.ngen) + fill!(model.dpg_sol, 0.0) + + model.dqg_sol = Array{Float64,1}(undef, model.grid_data.ngen) + fill!(model.dqg_sol, 0.0) + + model.dline_var = Array{Float64,2}(undef,(6, model.grid_data.nline)) + fill!(model.dline_var, 0.0) + + model.dline_fl = Array{Float64,2}(undef,(4, model.grid_data.nline)) + fill!(model.dline_fl, 0.0) + + model.dtheta_sol = Array{Float64,1}(undef, model.grid_data.nbus) + fill!(model.dtheta_sol, 0.0) + + model.dw_sol = Array{Float64,1}(undef, model.grid_data.nbus) + fill!(model.dw_sol, 0.0) + + # for SQP + model.dual_infeas = Array{Float64,1}(undef, model.grid_data.ngen + 6*model.grid_data.nline) + fill!(model.dual_infeas, 1000.0) + + model.lambda = TM(undef, (4, model.grid_data.nline)) + fill!(model.lambda, 0.0) + + #additional options + model.supY = TM(undef, (4*model.grid_data.nline, 8)) #see supY def in eval_A_b_branch_kernel_gpu_qpsub + fill!(model.supY, 0.0) + + + return model + end +end + +""" +This is to share power network data between models. Some fields that could be modified are deeply copied. +""" +function Base.copy(ref::ModelQpsub{T,TD,TI,TM}) where {T, TD<:AbstractArray{T}, TI<:AbstractArray{Int}, TM<:AbstractArray{T,2}} + model = ModelQpsub{T,TD,TI,TM}() + + model.solution = copy(ref.solution) + model.gen_solution = copy(ref.gen_solution) + model.info = copy(ref.info) + model.grid_data = copy(ref.grid_data) + + model.n = ref.n + model.nvar = ref.nvar + + model.gen_start = ref.gen_start + model.line_start = ref.line_start + + model.pgmin_curr = copy(ref.pgmin_curr) + model.pgmax_curr = copy(ref.pgmax_curr) + + model.membuf = copy(ref.membuf) + model.gen_membuf = copy(ref.gen_membuf) + model.qpsub_membuf = copy(ref.qpsub_membuf) + + model.nvar_u = ref.nvar_u + model.nvar_v = ref.nvar_v + model.bus_start = ref.bus_start + + model.nline_padded = ref.nline_padded + model.nvar_padded = ref.nvar_padded + model.nvar_u_padded = ref.nvar_u_padded + + model.Hs = copy(ref.Hs) + model.LH_1h = copy(ref.LH_1h) + model.RH_1h = copy(ref.RH_1h) + + model.LH_1i = copy(ref.LH_1i) + model.RH_1i = copy(ref.RH_1i) + + model.LH_1j = copy(ref.LH_1j) + model.RH_1j = copy(ref.RH_1j) + + model.LH_1k = copy(ref.LH_1k) + model.RH_1k = copy(ref.RH_1k) + + model.ls = copy(ref.ls) + model.us = copy(ref.us) + + model.line_res = copy(ref.line_res) + + model.sqp_line = copy(ref.sqp_line) + + model.dpg_sol = copy(ref.dpg_sol) + model.dqg_sol = copy(ref.dqg_sol) + model.dline_var = copy(ref.dline_var) + model.dline_fl = copy(ref.dline_fl) + model.dtheta_sol = copy(ref.dtheta_sol) + model.dw_sol = copy(ref.dw_sol) + + model.is_HS_sym = copy(ref.is_HS_sym) + model.is_HS_PSD = copy(ref.is_HS_PSD) + + model.qpsub_c1 = copy(ref.qpsub_c1) + model.qpsub_c2 = copy(ref.qpsub_c2) + + model.qpsub_pgmax = copy(ref.qpsub_pgmax) + model.qpsub_pgmin = copy(ref.qpsub_pgmin) + model.qpsub_qgmax = copy(ref.qpsub_qgmax) + model.qpsub_qgmin = copy(ref.qpsub_qgmin) + + model.qpsub_Pd = 
copy(ref.qpsub_Pd) + model.qpsub_Qd = copy(ref.qpsub_Qd) + + # for SQP + model.dual_infeas = copy(ref.dual_infeas) + model.lambda = copy(ref.lambda) + + #additional parameters + model.supY = copy(ref.supY) + + return model +end diff --git a/src/models/qpsub/qpsub_tron_linelimit_kernel.jl b/src/models/qpsub/qpsub_tron_linelimit_kernel.jl new file mode 100644 index 0000000..67c0150 --- /dev/null +++ b/src/models/qpsub/qpsub_tron_linelimit_kernel.jl @@ -0,0 +1,170 @@ +""" +Driver to run TRON on GPU. This should be called from a kernel. + + tron_gpu_test() +- solves bounded QP: 1/2 x^THx + b*x s.t xl <= x <= xu +- includes eval_f_kernel(), eval_g_kernel(), and eval_h_kernel +""" +@inline function tron_gpu_test(n::Int, H::CuDeviceArray{Float64,2}, b::CuDeviceArray{Float64,1}, x::CuDeviceArray{Float64,1}, xl::CuDeviceArray{Float64,1}, xu::CuDeviceArray{Float64,1}) + tx = threadIdx().x + + + g = CuDynamicSharedArray(Float64, n, (3*n + (2*n + n^2 + 178))*sizeof(Float64)) + xc = CuDynamicSharedArray(Float64, n, (4*n + (2*n + n^2 + 178))*sizeof(Float64)) + s = CuDynamicSharedArray(Float64, n, (5*n + (2*n + n^2 + 178))*sizeof(Float64)) + wa = CuDynamicSharedArray(Float64, n, (6*n + (2*n + n^2 + 178))*sizeof(Float64)) + wa1 = CuDynamicSharedArray(Float64, n, (7*n + (2*n + n^2 + 178))*sizeof(Float64)) + wa2 = CuDynamicSharedArray(Float64, n, (8*n + (2*n + n^2 + 178))*sizeof(Float64)) + wa3 = CuDynamicSharedArray(Float64, n, (9*n + (2*n + n^2 + 178))*sizeof(Float64)) + wa4 = CuDynamicSharedArray(Float64, n, (10*n + (2*n + n^2 + 178))*sizeof(Float64)) + wa5 = CuDynamicSharedArray(Float64, n, (11*n + (2*n + n^2 + 178))*sizeof(Float64)) + gfree = CuDynamicSharedArray(Float64, n, (12*n + (2*n + n^2 + 178))*sizeof(Float64)) + dsave = CuDynamicSharedArray(Float64, n, (13*n + (2*n + n^2 + 178))*sizeof(Float64)) + indfree = CuDynamicSharedArray(Int, n, (14*n + (2*n + n^2 + 178))*sizeof(Float64)) + iwa = CuDynamicSharedArray(Int, 2*n, n*sizeof(Int) + (14*n + (2*n + n^2 + 178))*sizeof(Float64)) + isave = CuDynamicSharedArray(Int, n, (3*n)*sizeof(Int) + (14*n + (2*n + n^2 + 178))*sizeof(Float64)) + + + A = CuDynamicSharedArray(Float64, (n,n), (14*n + (2*n + n^2 + 178))*sizeof(Float64)+(4*n)*sizeof(Int)) + B = CuDynamicSharedArray(Float64, (n,n), (14*n+n^2 + (2*n + n^2 + 178))*sizeof(Float64)+(4*n)*sizeof(Int)) + L = CuDynamicSharedArray(Float64, (n,n), (14*n+2*n^2 + (2*n + n^2 + 178))*sizeof(Float64)+(4*n)*sizeof(Int)) + + if tx <= n + @inbounds for j=1:n + A[tx,j] = 0.0 + B[tx,j] = 0.0 + L[tx,j] = 0.0 + end + end + CUDA.sync_threads() + + max_feval = 500 + max_minor = 200 + gtol = 1e-6 + + task = 0 + status = 0 + + delta = 0.0 + fatol = 0.0 + frtol = 1e-12 + fmin = -1e32 + cgtol = 0.1 + cg_itermax = n + + f = 0.0 + nfev = 0 + ngev = 0 + nhev = 0 + minor_iter = 0 + search = true + + while search + + if task == 0 || task == 1 + + f = eval_f_kernel(x,H,b) + nfev += 1 + if nfev >= max_feval + search = false + end + end + + # [2] G or H: Evaluate gradient and Hessian. + + if task == 0 || task == 2 + eval_g_kernel(x,g,H,b) + eval_h_kernel(A,H) + ngev += 1 + nhev += 1 + minor_iter += 1 + end + + # Initialize the trust region bound. + + if task == 0 + gnorm0 = ExaAdmm.ExaTron.dnrm2(n, g, 1) + delta = gnorm0 + end + + if search + delta, task = ExaAdmm.ExaTron.dtron(n, x, xl, xu, f, g, A, frtol, fatol, fmin, cgtol, + cg_itermax, delta, task, B, L, xc, s, indfree, gfree, + isave, dsave, wa, iwa, wa1, wa2, wa3, wa4, wa5) + end + + # [3] NEWX: a new point was computed. 
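+        # At a new point, the projected gradient norm is compared against gtol to declare convergence (task = 4),
+        # and the search stops once the minor-iteration budget max_minor is exhausted.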
+ + if task == 3 + gnorm_inf = ExaAdmm.ExaTron.dgpnorm(n, x, xl, xu, g) + + if gnorm_inf <= gtol + task = 4 + end + + if minor_iter >= max_minor + status = 1 + search = false + end + end + + # [4] CONV: convergence was achieved. + # [10] : warning fval is less than fmin + + if task == 4 || task == 10 + search = false + end + end # end while + + + CUDA.sync_threads() + + return status, minor_iter +end + +@inline function eval_f_kernel(x::CuDeviceArray{Float64,1},A::CuDeviceArray{Float64,2},b::CuDeviceArray{Float64,1}) #f gpu + f = 0.0 + @inbounds begin + for i = 1:6 + for j = 1:6 + f += 0.5*x[i]*A[i,j]*x[j] + end + f += b[i]*x[i] + end + end + CUDA.sync_threads() + return f +end + +@inline function eval_g_kernel(x::CuDeviceArray{Float64,1}, g::CuDeviceArray{Float64,1}, A::CuDeviceArray{Float64,2}, b::CuDeviceArray{Float64,1}) + tx = threadIdx().x + g1 = A[1,1]*x[1] + A[1,2]*x[2] + A[1,3]*x[3] + A[1,4]*x[4] + A[1,5]*x[5] + A[1,6]*x[6] + b[1] + g2 = A[2,1]*x[1] + A[2,2]*x[2] + A[2,3]*x[3] + A[2,4]*x[4] + A[2,5]*x[5] + A[2,6]*x[6] + b[2] + g3 = A[3,1]*x[1] + A[3,2]*x[2] + A[3,3]*x[3] + A[3,4]*x[4] + A[3,5]*x[5] + A[3,6]*x[6] + b[3] + g4 = A[4,1]*x[1] + A[4,2]*x[2] + A[4,3]*x[3] + A[4,4]*x[4] + A[4,5]*x[5] + A[4,6]*x[6] + b[4] + g5 = A[5,1]*x[1] + A[5,2]*x[2] + A[5,3]*x[3] + A[5,4]*x[4] + A[5,5]*x[5] + A[5,6]*x[6] + b[5] + g6 = A[6,1]*x[1] + A[6,2]*x[2] + A[6,3]*x[3] + A[6,4]*x[4] + A[6,5]*x[5] + A[6,6]*x[6] + b[6] + if tx == 1 + g[1] = g1 + g[2] = g2 + g[3] = g3 + g[4] = g4 + g[5] = g5 + g[6] = g6 + end + CUDA.sync_threads() +end + +@inline function eval_h_kernel(A::CuDeviceArray{Float64,2}, H::CuDeviceArray{Float64,2}) + tx = threadIdx().x + if tx == 1 + @inbounds begin + for i = 1:6 + for j = 1:6 + A[i,j] = H[i,j] + end + end + end + end + CUDA.sync_threads() +end diff --git a/src/not_used/check_violations.jl b/src/not_used/check_violations.jl deleted file mode 100644 index e5ba462..0000000 --- a/src/not_used/check_violations.jl +++ /dev/null @@ -1,440 +0,0 @@ -function check_generator_bounds(model::Model, v::Vector{Float64}) - ngen = model.ngen - gen_start = model.gen_start - - pgmax = model.pgmax_curr; pgmin = model.pgmin_curr - qgmax = model.qgmax; qgmin = model.qgmin - - max_viol_real = 0.0 - max_viol_reactive = 0.0 - - for g=1:ngen - pidx = gen_start + 2*(g-1) - qidx = gen_start + 2*(g-1) + 1 - - real_err = max(max(0.0, v[pidx] - pgmax[g]), max(0.0, pgmin[g] - v[pidx])) - reactive_err = max(max(0.0, v[qidx] - qgmax[g]), max(0.0, qgmin[g] - v[qidx])) - - max_viol_real = (max_viol_real < real_err) ? real_err : max_viol_real - max_viol_reactive = (max_viol_reactive < reactive_err) ? reactive_err : max_viol_reactive - end - - return max_viol_real, max_viol_reactive -end - -function check_voltage_bounds_alternative(model::Model, v::Vector{Float64}) - max_viol = 0.0 - - for b=1:model.nbus - if model.FrStart[b] < model.FrStart[b+1] - l = model.FrIdx[model.FrStart[b]] - wi = v[model.line_start + 8*(l-1) + 4] - elseif model.ToStart[b] < model.ToStart[b+1] - l = model.ToIdx[model.ToStart[b]] - wi = v[model.line_start + 8*(l-1) + 5] - else - println("No lines connected to bus ", b) - end - - err = max(max(0.0, wi - model.Vmax[b]^2), max(0.0, model.Vmin[b]^2 - wi)) - max_viol = (max_viol < err) ? 
err : max_viol - end - - return max_viol -end - -function check_power_balance_alternative(model::Model, u::Vector{Float64}, v::Vector{Float64}) - baseMVA = model.baseMVA - nbus = model.nbus - gen_start, line_start = model.gen_start, model.line_start - - YffR, YffI = model.YffR, model.YffI - YftR, YftI = model.YftR, model.YftI - YtfR, YtfI = model.YtfR, model.YtfI - YttR, YttI = model.YttR, model.YttI - YshR, YshI = model.YshR, model.YshI - - vm = [0.0 for i=1:nbus] - va = [0.0 for i=1:nbus] - - for b=1:nbus - count = 0 - for k=model.FrStart[b]:model.FrStart[b+1]-1 - l = model.FrIdx[k] - vm[b] = sqrt(v[line_start + 8*(l-1) + 4]) - va[b] = v[line_start + 8*(l-1) + 6] - count += 1 - end - for k=model.ToStart[b]:model.ToStart[b+1]-1 - l = model.ToIdx[k] - vm[b] = sqrt(v[line_start + 8*(l-1) + 5]) - va[b] = v[line_start + 8*(l-1) + 7] - count += 1 - end - #vm[b] /= count - #va[b] /= count - end - - max_viol_real = 0.0 - max_viol_reactive = 0.0 - max_viol_reactive_idx = -1 - for b=1:nbus - real_err = 0.0 - reactive_err = 0.0 - for k=model.GenStart[b]:model.GenStart[b+1]-1 - g = model.GenIdx[k] - real_err += v[gen_start + 2*(g-1)] - reactive_err += v[gen_start + 2*(g-1)+1] - end - - real_err -= (model.Pd[b] / baseMVA) - reactive_err -= (model.Qd[b] / baseMVA) - - for k=model.FrStart[b]:model.FrStart[b+1]-1 - l = model.FrIdx[k] - i = model.brBusIdx[1+2*(l-1)] - j = model.brBusIdx[1+2*(l-1)+1] - pij = YffR[l]*vm[i]^2 + YftR[l]*vm[i]*vm[j]*cos(va[i] - va[j]) + YftI[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - qij = -YffI[l]*vm[i]^2 - YftI[l]*vm[i]*vm[j]*cos(va[i] - va[j]) + YftR[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - real_err -= pij - reactive_err -= qij - end - - for k=model.ToStart[b]:model.ToStart[b+1]-1 - l = model.ToIdx[k] - i = model.brBusIdx[1+2*(l-1)] - j = model.brBusIdx[1+2*(l-1)+1] - pji = YttR[l]*vm[j]^2 + YtfR[l]*vm[i]*vm[j]*cos(va[i] - va[j]) - YtfI[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - qji = -YttI[l]*vm[j]^2 - YtfI[l]*vm[i]*vm[j]*cos(va[i] - va[j]) - YtfR[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - real_err -= pji - reactive_err -= qji - end - - real_err -= YshR[b] * vm[b]^2 - reactive_err += YshI[b] * vm[b]^2 - - if max_viol_reactive < abs(reactive_err) - max_viol_reactive_idx = b - end - max_viol_real = (max_viol_real < abs(real_err)) ? abs(real_err) : max_viol_real - max_viol_reactive = (max_viol_reactive < abs(reactive_err)) ? 
abs(reactive_err) : max_viol_reactive - end - - return max_viol_real, max_viol_reactive -end - -function check_linelimit_violation(model::Model, data::OPFData, v::Vector{Float64}) - lines = data.lines - nbus = length(data.buses) - nline = length(data.lines) - line_start = 2*length(data.generators) + 1 - - YffR, YffI = model.YffR, model.YffI - YftR, YftI = model.YftR, model.YftI - YtfR, YtfI = model.YtfR, model.YtfI - YttR, YttI = model.YttR, model.YttI - - rateA_nviols = 0 - rateA_maxviol = 0.0 - - vm = [0.0 for i=1:nbus] - va = [0.0 for i=1:nbus] - - for b=1:nbus - count = 0 - for k=model.FrStart[b]:model.FrStart[b+1]-1 - l = model.FrIdx[k] - vm[b] = sqrt(v[line_start + 8*(l-1) + 4]) - va[b] = v[line_start + 8*(l-1) + 6] - count += 1 - end - for k=model.ToStart[b]:model.ToStart[b+1]-1 - l = model.ToIdx[k] - vm[b] = sqrt(v[line_start + 8*(l-1) + 5]) - va[b] = v[line_start + 8*(l-1) + 7] - count += 1 - end - #vm[b] /= count - #va[b] /= count - end - - for l=1:nline - i = model.brBusIdx[1+2*(l-1)] - j = model.brBusIdx[1+2*(l-1)+1] - pij = YffR[l]*vm[i]^2 + YftR[l]*vm[i]*vm[j]*cos(va[i] - va[j]) + YftI[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - qij = -YffI[l]*vm[i]^2 - YftI[l]*vm[i]*vm[j]*cos(va[i] - va[j]) + YftR[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - pji = YttR[l]*vm[j]^2 + YtfR[l]*vm[i]*vm[j]*cos(va[i] - va[j]) - YtfI[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - qji = -YttI[l]*vm[j]^2 - YtfI[l]*vm[i]*vm[j]*cos(va[i] - va[j]) - YtfR[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - - ij_val = sqrt(pij^2 + qij^2) - ji_val = sqrt(pji^2 + qji^2) - - limit = lines[l].rateA / data.baseMVA - if limit > 0 - if ij_val > limit || ji_val > limit - rateA_nviols += 1 - rateA_maxviol = max(rateA_maxviol, max(ij_val - limit, ji_val - limit)) - end - end - end - - return rateA_nviols, rateA_maxviol -end - -function check_generator_bounds(model::ComplementarityModel, v::Vector{Float64}) - grid = model.grid - ngen = grid.ngen - gen_start = model.gen_start - - pgmax = model.pgmax_curr; pgmin = model.pgmin_curr - qgmax = grid.qgmax; qgmin = grid.qgmin - - max_viol_real = 0.0 - max_viol_reactive = 0.0 - - for g=1:ngen - pidx = gen_start + 2*(g-1) - qidx = gen_start + 2*(g-1) + 1 - - real_err = max(max(0.0, v[pidx] - pgmax[g]), max(0.0, pgmin[g] - v[pidx])) - reactive_err = max(max(0.0, v[qidx] - qgmax[g]), max(0.0, qgmin[g] - v[qidx])) - - max_viol_real = (max_viol_real < real_err) ? real_err : max_viol_real - max_viol_reactive = (max_viol_reactive < reactive_err) ? reactive_err : max_viol_reactive - end - - return max_viol_real, max_viol_reactive -end - -function check_voltage_bounds_alternative(model::ComplementarityModel, v::Vector{Float64}) - grid = model.grid - max_viol = 0.0 - - for b=1:grid.nbus - if grid.FrStart[b] < grid.FrStart[b+1] - l = grid.FrIdx[grid.FrStart[b]] - wi = v[model.line_start + 8*(l-1) + 4] - elseif grid.ToStart[b] < grid.ToStart[b+1] - l = grid.ToIdx[grid.ToStart[b]] - wi = v[model.line_start + 8*(l-1) + 5] - else - println("No lines connected to bus ", b) - end - - err = max(max(0.0, wi - grid.Vmax[b]^2), max(0.0, grid.Vmin[b]^2 - wi)) - max_viol = (max_viol < err) ? 
err : max_viol - end - - return max_viol -end - -function check_power_balance_alternative(model::ComplementarityModel, u::Vector{Float64}, v::Vector{Float64}) - grid = model.grid - baseMVA = grid.baseMVA - nbus = grid.nbus - gen_start, line_start = model.gen_start, model.line_start - - YffR, YffI = grid.YffR, grid.YffI - YftR, YftI = grid.YftR, grid.YftI - YtfR, YtfI = grid.YtfR, grid.YtfI - YttR, YttI = grid.YttR, grid.YttI - YshR, YshI = grid.YshR, grid.YshI - - vm = [0.0 for i=1:nbus] - va = [0.0 for i=1:nbus] - - for b=1:nbus - count = 0 - for k=grid.FrStart[b]:grid.FrStart[b+1]-1 - l = grid.FrIdx[k] - vm[b] = sqrt(v[line_start + 8*(l-1) + 4]) - va[b] = v[line_start + 8*(l-1) + 6] - count += 1 - end - for k=grid.ToStart[b]:grid.ToStart[b+1]-1 - l = grid.ToIdx[k] - vm[b] = sqrt(v[line_start + 8*(l-1) + 5]) - va[b] = v[line_start + 8*(l-1) + 7] - count += 1 - end - #vm[b] /= count - #va[b] /= count - end - - max_viol_real = 0.0 - max_viol_reactive = 0.0 - max_viol_reactive_idx = -1 - for b=1:nbus - real_err = 0.0 - reactive_err = 0.0 - for k=grid.GenStart[b]:grid.GenStart[b+1]-1 - g = grid.GenIdx[k] - real_err += v[gen_start + 2*(g-1)] - reactive_err += v[gen_start + 2*(g-1)+1] - end - - real_err -= (grid.Pd[b] / baseMVA) - reactive_err -= (grid.Qd[b] / baseMVA) - - for k=grid.FrStart[b]:grid.FrStart[b+1]-1 - l = grid.FrIdx[k] - i = grid.brBusIdx[1+2*(l-1)] - j = grid.brBusIdx[1+2*(l-1)+1] - pij = YffR[l]*vm[i]^2 + YftR[l]*vm[i]*vm[j]*cos(va[i] - va[j]) + YftI[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - qij = -YffI[l]*vm[i]^2 - YftI[l]*vm[i]*vm[j]*cos(va[i] - va[j]) + YftR[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - real_err -= pij - reactive_err -= qij - end - - for k=grid.ToStart[b]:grid.ToStart[b+1]-1 - l = grid.ToIdx[k] - i = grid.brBusIdx[1+2*(l-1)] - j = grid.brBusIdx[1+2*(l-1)+1] - pji = YttR[l]*vm[j]^2 + YtfR[l]*vm[i]*vm[j]*cos(va[i] - va[j]) - YtfI[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - qji = -YttI[l]*vm[j]^2 - YtfI[l]*vm[i]*vm[j]*cos(va[i] - va[j]) - YtfR[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - real_err -= pji - reactive_err -= qji - end - - real_err -= YshR[b] * vm[b]^2 - reactive_err += YshI[b] * vm[b]^2 - - if max_viol_reactive < abs(reactive_err) - max_viol_reactive_idx = b - end - max_viol_real = (max_viol_real < abs(real_err)) ? abs(real_err) : max_viol_real - max_viol_reactive = (max_viol_reactive < abs(reactive_err)) ? 
abs(reactive_err) : max_viol_reactive - end - - return max_viol_real, max_viol_reactive -end - -function check_linelimit_violation(model::ComplementarityModel, data::OPFData, v::Vector{Float64}) - grid = model.grid - lines = data.lines - nbus = grid.nbus - nline = grid.nline - line_start = model.line_start - - YffR, YffI = grid.YffR, grid.YffI - YftR, YftI = grid.YftR, grid.YftI - YtfR, YtfI = grid.YtfR, grid.YtfI - YttR, YttI = grid.YttR, grid.YttI - - rateA_nviols = 0 - rateA_maxviol = 0.0 - - vm = [0.0 for i=1:nbus] - va = [0.0 for i=1:nbus] - - for b=1:nbus - count = 0 - for k=grid.FrStart[b]:grid.FrStart[b+1]-1 - l = grid.FrIdx[k] - vm[b] += sqrt(v[line_start + 8*(l-1) + 4]) - va[b] += v[line_start + 8*(l-1) + 6] - count += 1 - end - for k=grid.ToStart[b]:grid.ToStart[b+1]-1 - l = grid.ToIdx[k] - vm[b] += sqrt(v[line_start + 8*(l-1) + 5]) - va[b] += v[line_start + 8*(l-1) + 7] - count += 1 - end - vm[b] /= count - va[b] /= count - end - - for l=1:nline - i = grid.brBusIdx[1+2*(l-1)] - j = grid.brBusIdx[1+2*(l-1)+1] - pij = YffR[l]*vm[i]^2 + YftR[l]*vm[i]*vm[j]*cos(va[i] - va[j]) + YftI[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - qij = -YffI[l]*vm[i]^2 - YftI[l]*vm[i]*vm[j]*cos(va[i] - va[j]) + YftR[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - pji = YttR[l]*vm[j]^2 + YtfR[l]*vm[i]*vm[j]*cos(va[i] - va[j]) - YtfI[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - qji = -YttI[l]*vm[j]^2 - YtfI[l]*vm[i]*vm[j]*cos(va[i] - va[j]) - YtfR[l]*vm[i]*vm[j]*sin(va[i] - va[j]) - - ij_val = sqrt(pij^2 + qij^2) - ji_val = sqrt(pji^2 + qji^2) - - limit = lines[l].rateA / data.baseMVA - if limit > 0 - if ij_val > limit || ji_val > limit - rateA_nviols += 1 - rateA_maxviol = max(rateA_maxviol, max(ij_val - limit, ji_val - limit)) - end - end - end - - return rateA_nviols, rateA_maxviol -end - -#= -function check_power_balance_alternative(model::Model, u::Vector{Float64}, v::Vector{Float64}) - baseMVA = model.baseMVA - nbus = model.nbus - gen_start, line_start, YshR, YshI = model.gen_start, model.line_start, model.YshR, model.YshI - - max_viol_real = 0.0 - max_viol_reactive = 0.0 - for b=1:nbus - real_err = 0.0 - reactive_err = 0.0 - for k=model.GenStart[b]:model.GenStart[b+1]-1 - g = model.GenIdx[k] - real_err += u[gen_start + 2*(g-1)] - reactive_err += u[gen_start + 2*(g-1)+1] - end - - real_err -= (model.Pd[b] / baseMVA) - reactive_err -= (model.Qd[b] / baseMVA) - - wi = 0 - for k=model.FrStart[b]:model.FrStart[b+1]-1 - l = model.FrIdx[k] - real_err -= v[line_start + 8*(l-1)] - reactive_err -= v[line_start + 8*(l-1) + 1] - wi = v[line_start + 8*(l-1) + 4] - end - - for k=model.ToStart[b]:model.ToStart[b+1]-1 - l = model.ToIdx[k] - real_err -= v[line_start + 8*(l-1) + 2] - reactive_err -= v[line_start + 8*(l-1) + 3] - wi = v[line_start + 8*(l-1) + 5] - end - - real_err -= YshR[b] * wi - reactive_err += YshI[b] * wi - - max_viol_real = (max_viol_real < abs(real_err)) ? abs(real_err) : max_viol_real - max_viol_reactive = (max_viol_reactive < abs(reactive_err)) ? 
abs(reactive_err) : max_viol_reactive - end - - return max_viol_real, max_viol_reactive -end - -function check_linelimit_violation(data::OPFData, v::Vector{Float64}) - lines = data.lines - nline = length(data.lines) - line_start = 2*length(data.generators) + 1 - - rateA_nviols = 0 - rateA_maxviol = 0.0 - - for l=1:nline - pij_idx = line_start + 8*(l-1) - ij_val = sqrt(v[pij_idx]^2 + v[pij_idx+1]^2) - ji_val = sqrt(v[pij_idx+2]^2 + v[pij_idx+3]^2) - - limit = lines[l].rateA / data.baseMVA - if limit > 0 - if ij_val > limit || ji_val > limit - rateA_nviols += 1 - rateA_maxviol = max(rateA_maxviol, max(ij_val - limit, ji_val - limit)) - end - end - end - - return rateA_nviols, rateA_maxviol -end -=# diff --git a/src/not_used/mpacopf_check_violations.jl b/src/not_used/mpacopf_check_violations.jl deleted file mode 100644 index 3fa8475..0000000 --- a/src/not_used/mpacopf_check_violations.jl +++ /dev/null @@ -1,8 +0,0 @@ -function check_ramp_violations(mod::Model, u_curr::Vector{Float64}, u_prev::Vector{Float64}, ramp_rate::Vector{Float64}) - max_viol = 0.0 - for g=1:mod.grid_data.ngen - pg_idx = mod.gen_start + 2*(g-1) - max_viol = max(max_viol, max(0.0, -(ramp_rate[g]-abs(u_curr[pg_idx]-u_prev[pg_idx])))) - end - return max_viol -end \ No newline at end of file diff --git a/src/utils/environment.jl b/src/utils/environment.jl index adbf4f5..20908d0 100644 --- a/src/utils/environment.jl +++ b/src/utils/environment.jl @@ -9,34 +9,36 @@ mutable struct Parameters ABSTOL::Float64 RELTOL::Float64 - rho_max::Float64 # TODO: not used - rho_min_pq::Float64 # TODO: not used - rho_min_w::Float64 # TODO: not used - eps_rp::Float64 # TODO: not used - eps_rp_min::Float64 # TODO: not used - rt_inc::Float64 # TODO: not used - rt_dec::Float64 # TODO: not used - eta::Float64 # TODO: not used + rho_max::Float64 #? not used + rho_min_pq::Float64 #? not used + rho_min_w::Float64 #? not used + eps_rp::Float64 #? not used + eps_rp_min::Float64 #? not used + rt_inc::Float64 #? not used + rt_dec::Float64 #? not used + eta::Float64 #? not used verbose::Int # MPI implementation shift_lines::Int # Two-Level ADMM - initial_beta::Float64 - beta::Float64 - inc_c::Float64 - theta::Float64 + initial_beta::Float64 #initial penalty for z when outer_iter = 1 + beta::Float64 #updated penalty for z + inc_c::Float64 #increase rate on beta + theta::Float64 #dynamic tolerance rate for norm_z_curr outer_eps::Float64 - Kf::Int # TODO: not used - Kf_mean::Int # TODO: not used + shmem_size::Int + gen_shmem_size::Int + Kf::Int #? not used + Kf_mean::Int #? 
not used MAX_MULTIPLIER::Float64 DUAL_TOL::Float64 outer_iterlim::Int inner_iterlim::Int - scale::Float64 - obj_scale::Float64 + scale::Float64 #scale Exatron eval_f, eval_g, eval_h + obj_scale::Float64 #scale obj c2, c1, c0 function Parameters() par = new() @@ -475,7 +477,7 @@ mutable struct ComplementarityModel{T,TD,TI,TM} <: AbstractOPFModel{T,TD,TI,TM} model.line_start = 4*model.grid.ngen + model.grid.nstorage + 1 # model.line_start = 3*model.grid.ngen + 1 - model.solution = SolutionOneLevel{T,TD}(model.nvar_padded) + model.solution = Solution{T,TD}(model.nvar_padded) init_solution!(model, model.solution, env.initial_rho_pq, env.initial_rho_va) model.gen_solution = EmptyGeneratorSolution{T,TD}() diff --git a/test/algorithms/qpsub_update_cpu.jl b/test/algorithms/qpsub_update_cpu.jl new file mode 100644 index 0000000..f01acca --- /dev/null +++ b/test/algorithms/qpsub_update_cpu.jl @@ -0,0 +1,239 @@ + + +case = joinpath(INSTANCES_DIR, "case9.m") +rho_pq = 20.0 #for two level +rho_va = 20.0 #for two level +initial_beta = 100000.0 #for two level +verbose = 0 + + +# Initialize an qpsub model with default options as shell for qpsub. + T = Float64; TD = Array{Float64,1}; TI = Array{Int,1}; TM = Array{Float64,2} + + env1 = ExaAdmm.AdmmEnv{T,TD,TI,TM}(case, rho_pq, rho_va; verbose=verbose) + mod1 = ExaAdmm.ModelQpsub{T,TD,TI,TM}(env1) + sol = mod1.solution + par = env1.params + data = mod1.grid_data + + pg = [0.8409840263442788, 1.3754683725459356, 0.9683515492257173] + qg = [0.1339605333132542, 0.004939066051572686, -0.22522034650172912] + line_var = [1.202283876496575 1.1854600895233969 1.1882407994325652 1.1939274133312217 1.197851907882065 1.1976278981260697 1.2041729663674776 1.1724433539605357 1.1711057984763673; 0.048440679917430465 0.02887729733428878 -0.1001598280647882 0.05674540078462704 0.03747916294088837 -0.0440601695432392 -0.08596677328412097 0.11837782697248565 -0.04146540783240968; 1.2100000032154183 1.1965562109975751 1.1751637818852145 1.1807295010262204 1.2100000106478777 1.1869866679607968 1.2100000106803914 1.2100000106803914 1.1476336495105626; 1.1965562109975751 1.1751637818852145 1.2100000106478777 1.2100000106478777 1.1869866679607968 1.2100000106803914 1.204481657995701 1.1476336495105626 1.1965562109975751; 3.25491739361945e-19 -0.04026877067245247 -0.064623523567375 0.06696282911866715 0.01947021797594214 -0.011808222565670412 0.024964724778954814 0.024964724778954814 -0.07566104101514581; -0.04026877067245247 -0.064623523567375 0.01947021797594214 0.01947021797594214 -0.011808222565670412 0.024964724778954814 0.0962345286951661 -0.07566104101514581 -0.04026877067245247] + line_fl = [0.8409840263442788 0.3250708808197873 -0.5764825106130851 0.9683515492257173 0.3807383869887371 -0.6207434318226953 -1.3754683725459356 0.7519258919133632 -0.5132124444249647; 0.1339605333132542 -0.03398537001400827 -0.1550260795071022 -0.22522034650172912 -0.05087639039200361 -0.1629431594683535 0.09323270900662127 -0.10130995083516688 -0.3167567555281643; -0.8409840263442788 -0.323517489386915 0.5876131622369802 -0.9683515492257173 -0.37925656817730474 0.6235424806325723 1.3754683725459356 -0.7367875555750353 0.5159131455244915; -0.09943863713541591 -0.14497392049289778 -0.2234000143290868 0.2742764047210904 -0.18705684053164653 0.008077241828545615 0.004939066051572686 -0.1832432444718357 0.13342400714942418] + pgb = [0.8409840263442788, 1.3754683725459356, 0.9683515492257173, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + pft = [0.8409840263442788, -3.009265538105056e-36, 0.9683515492257173, 
0.3250708808197873, -0.5764825106130851, 0.3807383869887371, -0.6207434318226953, -0.6235424806325723, -0.5132124444249647] + ptf = [0.0, 1.3754683725459356, 0.0, -0.3250708808197873, -0.323517489386915, -0.3807383869887371, -0.37925656817730474, 0.6235424806325723, -0.7367875555750353] + qgb = [0.1339605333132542, 0.004939066051572686, -0.22522034650172912, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + qft = [0.1339605333132542, 0.0, -0.22522034650172912, -0.03398537001400827, -0.1550260795071022, -0.05087639039200361, -0.1629431594683535, -0.008077241828545615, -0.3167567555281643] + qtf = [0.0, 0.004939066051572686, 0.0, 0.03398537001400827, -0.14497392049289778, 0.05087639039200361, -0.18705684053164653, 0.008077241828545615, -0.1832432444718357] + bus_w = [1.2100000032154183, 1.204481657995701, 1.1807295010262204, 1.1965562109975751, 1.1751637818852145, 1.2100000106478777, 1.1869866679607968, 1.2100000106803914, 1.1476336495105626] + + # save variable to Hs, 1h, 1j, 1i, 1k, new bound, new cost + @inbounds begin + pi_14 = -ones(4,data.nline) #set multiplier for the hessian evaluation 14h 14i 14j 14k + is_Hs_sym = zeros(data.nline) #is Hs symmetric + is_Hs_PSD = zeros(data.nline) #is Hs positive semidefinite + + + #gen bound + mod1.qpsub_pgmax .= data.pgmax - (pg) + mod1.qpsub_pgmin .= data.pgmin - (pg) + mod1.qpsub_qgmax .= data.qgmax - (qg) + mod1.qpsub_qgmin .= data.qgmin - (qg) + + #new cost coeff + mod1.qpsub_c1 = data.c1 + 2*data.c2.*(pg) + mod1.qpsub_c2 = data.c2 + + #w theta bound + for l = 1: data.nline + mod1.ls[l,1] = -2*data.FrVmBound[2*l]*data.ToVmBound[2*l] #wijR lb + mod1.us[l,1] = 2*data.FrVmBound[2*l]*data.ToVmBound[2*l] #wijR ub + mod1.ls[l,2] = -2*data.FrVmBound[2*l]*data.ToVmBound[2*l] #wijI lb + mod1.us[l,2] = 2*data.FrVmBound[2*l]*data.ToVmBound[2*l] #wijI ub + + mod1.ls[l,3] = data.FrVmBound[1+2*(l-1)]^2 - (line_var)[3,l] #wi lb + mod1.us[l,3] = data.FrVmBound[2*l]^2 - (line_var)[3,l] #wi ub + mod1.ls[l,4] = data.ToVmBound[1+2*(l-1)]^2 - (line_var)[4,l] #wj lb + mod1.us[l,4] = data.ToVmBound[2*l]^2 - (line_var)[4,l] #wj ub + + mod1.ls[l,5] = data.FrVaBound[1+2*(l-1)] - (line_var)[5,l] #ti lb + mod1.us[l,5] = data.FrVaBound[2*l] - (line_var)[5,l] #ti ub + mod1.ls[l,6] = data.ToVaBound[1+2*(l-1)] - (line_var)[6,l] #tj lb + mod1.us[l,6] = data.ToVaBound[2*l] - (line_var)[6,l] #tj ub + end + + for b = 1:data.nbus + mod1.qpsub_Pd[b] = data.baseMVA * (data.Pd[b]/data.baseMVA - ((pgb)[b] - (pft)[b] - (ptf)[b] - data.YshR[b]*(bus_w)[b])) + mod1.qpsub_Qd[b] = data.baseMVA * (data.Qd[b]/data.baseMVA - ((qgb)[b] - (qft)[b] - (qtf)[b] + data.YshI[b]*(bus_w)[b])) + end + + for l = 1: data.nline + + #Hs:(6,6) #w_ijR, w_ijI, w_i, w_j, theta_i, theta_j + Hs = zeros(6,6) + + Hs_14h = zeros(6,6) + Hs_14h[1,1] = 2*pi_14[1,l] + Hs_14h[2,2] = 2*pi_14[1,l] + Hs_14h[3,4] = -pi_14[1,l] + Hs_14h[4,3] = -pi_14[1,l] + + Hs_14i = zeros(6,6) + cons_1 = pi_14[2,l]*cos((line_var)[5,l] - (line_var)[6,l]) + cons_2 = pi_14[2,l]*sin((line_var)[5,l] - (line_var)[6,l]) + cons_3 = pi_14[2,l]*(-(line_var)[1,l]*sin((line_var)[5,l] - (line_var)[6,l]) + (line_var)[1,2]*cos((line_var)[5,l] - (line_var)[6,l])) + + Hs_14i[1,5] = cons_1 #wijR theta_i + Hs_14i[5,1] = cons_1 #wijR theta_i + Hs_14i[1,6] = -cons_1 #wijR theta_j + Hs_14i[6,1] = -cons_1 #wijR theta_j + + Hs_14i[2,5] = cons_2 #wijR theta_i + Hs_14i[5,2] = cons_2 #wijR theta_i + Hs_14i[2,6] = -cons_2 #wijR theta_j + Hs_14i[6,2] = -cons_2 #wijR theta_j + + Hs_14i[5,5] = cons_3 #thetai thetai + Hs_14i[6,6] = cons_3 #thetaj thetaj + Hs_14i[5,6] = -cons_3 #thetai 
thetaj + Hs_14i[6,5] = -cons_3 #thetaj thetai + + supY = [data.YftR[l] data.YftI[l] data.YffR[l] 0 0 0; + -data.YftI[l] data.YftR[l] -data.YffI[l] 0 0 0; + data.YtfR[l] -data.YtfI[l] 0 data.YttR[l] 0 0; + -data.YtfI[l] -data.YtfR[l] 0 -data.YttI[l] 0 0] + Hs_14j = -2*pi_14[3,l]*(supY[1,:]*transpose(supY[1,:]) + supY[2,:]*transpose(supY[2,:]) ) + Hs_14k = -2*pi_14[4,l]*(supY[3,:]*transpose(supY[3,:]) + supY[4,:]*transpose(supY[4,:]) ) + Hs .= Hs_14h + Hs_14i + Hs_14j + Hs_14k + UniformScaling(4) + mod1.Hs[6*(l-1)+1:6*l,1:6] .= Hs + + is_Hs_sym[l] = maximum(abs.(Hs - transpose(Hs))) + @assert is_Hs_sym[l] <= 1e-6 + eival, eivec = eigen(Hs) + is_Hs_PSD[l] = minimum(eival) + @assert is_Hs_PSD[l] >= 0.0 + + #inherit structure of Linear Constraint (overleaf): ignore 1h and 1i with zero assignment in ipopt benchmark + LH_1h = [2*(line_var[1,l]), 2*(line_var[2,l]), -(line_var[4,l]), -(line_var[3,l])] #LH * x = RH + mod1.LH_1h[l,:] .= LH_1h + RH_1h = -((line_var)[1,l])^2 - ((line_var)[2,l])^2 + (line_var)[3,l]*(line_var)[4,l] + mod1.RH_1h[l] = RH_1h + + LH_1i = [sin((line_var)[5,l] - (line_var)[6,l]), -cos((line_var)[5,l] - (line_var)[6,l]), + (line_var)[1,l]*cos((line_var)[5,l] - (line_var)[6,l]) + (line_var)[2,l]*sin((line_var)[5,l] - (line_var)[6,l]), + -(line_var)[1,l]*cos((line_var)[5,l] - (line_var)[6,l]) - (line_var)[2,l]*sin((line_var)[5,l] - (line_var)[6,l])] #Lf * x = RH + mod1.LH_1i[l,:] .= LH_1i + RH_1i = -(line_var)[1,l]*sin((line_var)[5,l] - (line_var)[6,l]) + (line_var)[2,l]*cos((line_var)[5,l] - (line_var)[6,l]) + mod1.RH_1i[l] = RH_1i + + #inherit structure line limit constraint (overleaf) + LH_1j = [2*(line_fl)[1,l], 2*(line_fl)[2,l]] #rand(2) + mod1.LH_1j[l,:] .= LH_1j + RH_1j = -(((line_fl)[1,l])^2 + ((line_fl)[2,l])^2 - data.rateA[l]) + mod1.RH_1j[l] = RH_1j + + LH_1k = [2*(line_fl)[3,l], 2*(line_fl)[4,l]] #zeros(2) #rand(2) + mod1.LH_1k[l,:] .= LH_1k + RH_1k = -(((line_fl)[3,l])^2 + ((line_fl)[4,l])^2 - data.rateA[l]) + mod1.RH_1k[l] = RH_1k + end + end #inbound + +# test one-iteration update +@testset "Testing [x,xbar,l,residual] updates" begin + atol = 2e-6 + env2 = ExaAdmm.AdmmEnv{T,TD,TI,TM}(case, rho_pq, rho_va; verbose=verbose) + mod2 = ExaAdmm.ModelQpsub{T,TD,TI,TM}(env2) + sol2 = mod2.solution + par2 = env2.params + data2 = mod2.grid_data + info2 = mod2.info + + env2.params.scale = 1e-4 + env2.params.initial_beta = 1e3 + env2.params.beta = 1e3 + + mod2.Hs .= mod1.Hs + mod2.LH_1h .= mod1.LH_1h + mod2.RH_1h .= mod1.RH_1h + mod2.LH_1i .= mod1.LH_1i + mod2.RH_1i .= mod1.RH_1i + mod2.LH_1j .= mod1.LH_1j + mod2.RH_1j .= mod1.RH_1j + mod2.LH_1k .= mod1.LH_1k + mod2.RH_1k .= mod1.RH_1k + mod2.ls .= mod1.ls + mod2.us .= mod1.us + + mod2.qpsub_pgmax .= mod1.qpsub_pgmax + mod2.qpsub_pgmin .= mod1.qpsub_pgmin + mod2.qpsub_qgmax .= mod1.qpsub_qgmax + mod2.qpsub_qgmin .= mod1.qpsub_qgmin + + mod2.qpsub_c1 .= mod1.qpsub_c1 + mod2.qpsub_c2 .= mod1.qpsub_c2 + mod2.qpsub_Pd .= mod1.qpsub_Pd + mod2.qpsub_Qd .= mod1.qpsub_Qd + + ExaAdmm.init_solution!(mod2, mod2.solution, env2.initial_rho_pq, env2.initial_rho_va) + + info2.norm_z_prev = info2.norm_z_curr = 0 + par2.initial_beta = 0 + par2.beta = 0 + sol2.lz .= 0 + sol2.z_curr .= 0 + sol2.z_prev .= 0 + par2.inner_iterlim = 1 + + + ExaAdmm.admm_increment_outer(env2, mod2) + ExaAdmm.admm_outer_prestep(env2, mod2) + ExaAdmm.admm_increment_reset_inner(env2, mod2) + ExaAdmm.admm_increment_inner(env2, mod2) + ExaAdmm.admm_inner_prestep(env2, mod2) + + + ExaAdmm.admm_update_x(env2, mod2) + U_SOL = [-0.229424, -0.1339605, -0.0813327, -0.0049391, 
-0.0465958, 0.2252203, -0.1236772, -0.1271249, 0.1236772, 0.1189746, -0.1182455, -0.1040701, -0.0, 0.0022044, -0.0630247, -0.1092983, 0.0622891, 0.1082608, -0.0297814, -0.0074735, 0.0422504, 0.0451598, 0.0984247, 0.0700557, -0.102193, -0.0905333, 0.0294319, -0.0067947, 0.0250279, 0.0125998, -0.1368894, 0.2542204, 0.1368894, -0.2698603, -0.0770438, -0.1077549, -0.0375397, -0.0344878, -0.0665263, -0.1146067, 0.0658612, 0.1071325, -0.0032826, 0.0208988, -0.0055371, -0.0008479, 0.1049601, 0.1480269, -0.1061072, -0.1593811, 0.0230133, -0.0010432, -0.0026878, -0.0082759, 0.2045771, -0.05922, -0.2045771, 0.0340703, -0.0553384, -0.0495078, -0.0467405, -0.0542589, -0.1322638, -0.1856806, 0.1264451, 0.1533923, -0.0223818, 0.0420754, 0.0141644, 0.0280826, 0.0937455, 0.2743344, -0.0957246, -0.2938647, 0.0406687, -0.0099012, 0.0507601, 0.0458481] + @test norm( sol2.u_curr .- U_SOL, Inf) <= atol + + + + ExaAdmm.admm_update_xbar(env2, mod2) + V_SOL = [-0.1765506, -0.1305427, -0.1429549, 0.0145656, -0.0917426, 0.2397204, -0.1765506, -0.1305427, 0.1353679, 0.2137041, -0.1182455, -0.0479176, -0.0, 0.030101, -0.051334, -0.0145688, -0.0180678, 0.0191025, -0.0479176, 0.0109792, 0.030101, 0.0350939, 0.0180678, -0.0191025, -0.091583, 0.0678001, 0.0109792, -0.0392774, 0.0350939, -0.0091417, -0.0917426, 0.2397204, 0.1474993, -0.1115269, -0.0770438, -0.0392774, -0.0375397, -0.0091417, -0.0559163, 0.0437267, -0.0195494, -0.0204472, -0.0392774, 0.0219561, -0.0091417, -0.0017678, 0.0195494, 0.0204472, -0.0948426, -0.0246205, 0.0219561, -0.0262545, -0.0017678, -0.0136173, 0.2158417, 0.0755405, -0.1429549, 0.0145656, -0.0262545, -0.0495078, -0.0136173, -0.0542589, -0.1209991, -0.05092, 0.0163498, -0.0604711, -0.0262545, 0.041372, -0.0136173, 0.0394213, -0.0163498, 0.0604711, -0.0840339, -0.1991353, 0.041372, -0.0479176, 0.0394213, 0.030101] + @test norm( sol2.v_curr .- V_SOL, Inf) <= atol + + + + + ExaAdmm.admm_update_l_single(env2, mod2) + L_SOL = [-1.057468, -0.0683565, 1.2324431, -0.390094, 0.9029359, -0.2900002, 1.057468, 0.0683565, -0.2338139, -1.8945894, 0.0, -1.1230512, 0.0, -0.5579311, -0.2338139, -1.8945894, 1.6071386, 1.7831649, 0.362723, -0.3690544, 0.2429884, 0.201319, 1.6071386, 1.7831649, -0.2121989, -3.166669, 0.3690544, 0.6496544, -0.201319, 0.4348308, -0.9029359, 0.2900002, -0.2121989, -3.166669, 0.0, -1.3695503, 0.0, -0.5069225, -0.2121989, -3.166669, 1.7082129, 2.5515935, 0.7198958, -0.0211449, 0.0720918, 0.0183997, 1.7082129, 2.5515935, -0.2252933, -2.6952112, 0.0211449, 0.5042264, -0.0183997, 0.1068284, -0.2252933, -2.6952112, -1.2324431, 0.390094, -0.5816791, 0.0, -0.6624625, 0.0, -0.2252933, -2.6952112, 2.2019063, 4.277267, 0.0774527, 0.0140663, 0.5556341, -0.2267749, 2.2019063, 4.277267, -0.2338139, -1.8945894, -0.0140663, 0.7603282, 0.2267749, 0.3149427] + @test norm( sol2.l_curr .- L_SOL, Inf) <= atol + + + + ExaAdmm.admm_update_residual(env2, mod2) + RP_SOL = [-0.0528734, -0.0034178, 0.0616222, -0.0195047, 0.0451468, -0.0145, 0.0528734, 0.0034178, -0.0116907, -0.0947295, 0.0, -0.0561526, 0.0, -0.0278966, -0.0116907, -0.0947295, 0.0803569, 0.0891582, 0.0181362, -0.0184527, 0.0121494, 0.0100659, 0.0803569, 0.0891582, -0.0106099, -0.1583335, 0.0184527, 0.0324827, -0.0100659, 0.0217415, -0.0451468, 0.0145, -0.0106099, -0.1583335, 0.0, -0.0684775, 0.0, -0.0253461, -0.0106099, -0.1583335, 0.0854106, 0.1275797, 0.0359948, -0.0010572, 0.0036046, 0.00092, 0.0854106, 0.1275797, -0.0112647, -0.1347606, 0.0010572, 0.0252113, -0.00092, 0.0053414, -0.0112647, -0.1347606, -0.0616222, 0.0195047, -0.029084, 
0.0, -0.0331231, 0.0, -0.0112647, -0.1347606, 0.1100953, 0.2138633, 0.0038726, 0.0007033, 0.0277817, -0.0113387, 0.1100953, 0.2138633, -0.0116907, -0.0947295, -0.0007033, 0.0380164, 0.0113387, 0.0157471] + RD_SOL = [-12.7113319, 0.0683565, -6.3497307, 0.390094, -10.4678211, 0.2900002, -3.5310125, 66.8335931, 2.7073584, 69.0505462, 1.6350909, 2.7727729, 0.0, -0.2033556, 6.2198775, 38.6305254, 6.0542368, 34.8407725, 2.7727729, 3.5228599, -0.2033556, -0.5905927, 4.596186, 17.4861768, 3.2963767, 22.9929842, 3.5228599, 3.2144525, -0.5905927, 0.2065699, -1.8348521, 63.0638615, 2.9499869, 66.0288518, 1.8737146, 3.2144525, 0.5884628, 0.2065699, 3.5020235, 39.5936187, 3.6977135, 33.8548621, 3.2144525, 3.9788551, 0.2065699, -0.2715214, 6.1151706, 48.6324263, 4.5716387, 54.0015076, 3.9788551, 3.4749108, -0.2715214, 0.2269477, 4.3168345, 65.510814, -2.8590981, 62.5254434, 3.4749108, 2.8994775, 0.2269477, 0.8395123, 2.3304349, 22.2701391, 3.5960831, 14.8170115, 3.4749108, 3.580114, 0.2269477, -0.7247947, 3.4309189, 32.9094673, 3.4130062, 38.9852785, 3.580114, 2.7727729, -0.7247947, -0.2033556] + + @test norm( sol2.rp .- RP_SOL, Inf) <= atol + @test norm( sol2.rd .- RD_SOL, Inf) <= atol + + +end #@testset + + +#test solution +@testset "Qpsub ADMM (CPU) vs IPOPT" begin + env3, mod3 = ExaAdmm.solve_qpsub(case, mod1.Hs, mod1.LH_1h, mod1.RH_1h, + mod1.LH_1i, mod1.RH_1i, mod1.LH_1j, mod1.RH_1j, mod1.LH_1k, mod1.RH_1k, mod1.ls, mod1.us, mod1.qpsub_pgmax, mod1.qpsub_pgmin, mod1.qpsub_qgmax, mod1.qpsub_qgmin, mod1.qpsub_c1, mod1.qpsub_c2, mod1.qpsub_Pd, mod1.qpsub_Qd, + initial_beta; + outer_iterlim=10000, inner_iterlim=1, scale = 1e-4, obj_scale = 1, rho_pq = 4000.0, rho_va = 4000.0, verbose=0, outer_eps=2*1e-6, onelevel = true) + + + @test mod3.info.status == :Solved + @test mod3.info.outer == 5107 + @test mod3.info.cumul == 5107 + @test isapprox(mod3.info.objval, -21.92744641968529; atol=1e-6) +end \ No newline at end of file diff --git a/test/algorithms/qpsub_update_gpu.jl b/test/algorithms/qpsub_update_gpu.jl new file mode 100644 index 0000000..49a7ae4 --- /dev/null +++ b/test/algorithms/qpsub_update_gpu.jl @@ -0,0 +1,358 @@ +@testset "Testing [x,xbar,l,res] updates and a solve for ACOPF" begin + +rho_pq = 20.0 #for two level +rho_va = 20.0 #for two level +initial_beta = 100000.0 #for two level +atol = 2e-6 +verbose = 0 +use_gpu = true + +case = joinpath(INSTANCES_DIR, "case9.m") + +# Initialize an cpu qpsub model with default options as shell for qpsub. 
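+# The model built next with plain Array storage is only a CPU "shell" holding the QP
+# subproblem data (Hs, the 1h-1k linearizations, the bounds ls/us, and the adjusted
+# generator cost/limit data); the device model constructed further below uses CuArray
+# storage and receives copies of these arrays. A minimal sketch of how the storage
+# types pick the backend for that second model (illustrative only, assuming a working
+# CUDA device and the use_gpu flag defined above):
+#   TD = use_gpu ? CuArray{Float64,1} : Array{Float64,1}
+#   TI = use_gpu ? CuArray{Int,1}     : Array{Int,1}
+#   TM = use_gpu ? CuArray{Float64,2} : Array{Float64,2}
+#   env = ExaAdmm.AdmmEnv{Float64,TD,TI,TM}(case, rho_pq, rho_va; use_gpu=use_gpu, verbose=verbose)
+#   mod = ExaAdmm.ModelQpsub{Float64,TD,TI,TM}(env)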
+ T = Float64; TD = Array{Float64,1}; TI = Array{Int,1}; TM = Array{Float64,2} + + env1 = ExaAdmm.AdmmEnv{T,TD,TI,TM}(case, rho_pq, rho_va; verbose=verbose) + mod1 = ExaAdmm.ModelQpsub{T,TD,TI,TM}(env1) + sol = mod1.solution + par = env1.params + data = mod1.grid_data + pg = [0.8409840263442788, 1.3754683725459356, 0.9683515492257173] + qg = [0.1339605333132542, 0.004939066051572686, -0.22522034650172912] + line_var = [1.202283876496575 1.1854600895233969 1.1882407994325652 1.1939274133312217 1.197851907882065 1.1976278981260697 1.2041729663674776 1.1724433539605357 1.1711057984763673; 0.048440679917430465 0.02887729733428878 -0.1001598280647882 0.05674540078462704 0.03747916294088837 -0.0440601695432392 -0.08596677328412097 0.11837782697248565 -0.04146540783240968; 1.2100000032154183 1.1965562109975751 1.1751637818852145 1.1807295010262204 1.2100000106478777 1.1869866679607968 1.2100000106803914 1.2100000106803914 1.1476336495105626; 1.1965562109975751 1.1751637818852145 1.2100000106478777 1.2100000106478777 1.1869866679607968 1.2100000106803914 1.204481657995701 1.1476336495105626 1.1965562109975751; 3.25491739361945e-19 -0.04026877067245247 -0.064623523567375 0.06696282911866715 0.01947021797594214 -0.011808222565670412 0.024964724778954814 0.024964724778954814 -0.07566104101514581; -0.04026877067245247 -0.064623523567375 0.01947021797594214 0.01947021797594214 -0.011808222565670412 0.024964724778954814 0.0962345286951661 -0.07566104101514581 -0.04026877067245247] + line_fl = [0.8409840263442788 0.3250708808197873 -0.5764825106130851 0.9683515492257173 0.3807383869887371 -0.6207434318226953 -1.3754683725459356 0.7519258919133632 -0.5132124444249647; 0.1339605333132542 -0.03398537001400827 -0.1550260795071022 -0.22522034650172912 -0.05087639039200361 -0.1629431594683535 0.09323270900662127 -0.10130995083516688 -0.3167567555281643; -0.8409840263442788 -0.323517489386915 0.5876131622369802 -0.9683515492257173 -0.37925656817730474 0.6235424806325723 1.3754683725459356 -0.7367875555750353 0.5159131455244915; -0.09943863713541591 -0.14497392049289778 -0.2234000143290868 0.2742764047210904 -0.18705684053164653 0.008077241828545615 0.004939066051572686 -0.1832432444718357 0.13342400714942418] + pgb = [0.8409840263442788, 1.3754683725459356, 0.9683515492257173, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + pft = [0.8409840263442788, -3.009265538105056e-36, 0.9683515492257173, 0.3250708808197873, -0.5764825106130851, 0.3807383869887371, -0.6207434318226953, -0.6235424806325723, -0.5132124444249647] + ptf = [0.0, 1.3754683725459356, 0.0, -0.3250708808197873, -0.323517489386915, -0.3807383869887371, -0.37925656817730474, 0.6235424806325723, -0.7367875555750353] + qgb = [0.1339605333132542, 0.004939066051572686, -0.22522034650172912, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + qft = [0.1339605333132542, 0.0, -0.22522034650172912, -0.03398537001400827, -0.1550260795071022, -0.05087639039200361, -0.1629431594683535, -0.008077241828545615, -0.3167567555281643] + qtf = [0.0, 0.004939066051572686, 0.0, 0.03398537001400827, -0.14497392049289778, 0.05087639039200361, -0.18705684053164653, 0.008077241828545615, -0.1832432444718357] + bus_w = [1.2100000032154183, 1.204481657995701, 1.1807295010262204, 1.1965562109975751, 1.1751637818852145, 1.2100000106478777, 1.1869866679607968, 1.2100000106803914, 1.1476336495105626] + + + # save variable to Hs, 1h, 1j, 1i, 1k, new bound, new cost + @inbounds begin + pi_14 = -ones(4,data.nline) #set multiplier for the hessian evaluation 14h 14i 14j 14k + is_Hs_sym = zeros(data.nline) #is Hs 
symmetric + is_Hs_PSD = zeros(data.nline) #is Hs positive semidefinite + + + #gen bound + mod1.qpsub_pgmax .= data.pgmax - (pg) + mod1.qpsub_pgmin .= data.pgmin - (pg) + mod1.qpsub_qgmax .= data.qgmax - (qg) + mod1.qpsub_qgmin .= data.qgmin - (qg) + + #new cost coeff + mod1.qpsub_c1 = data.c1 + 2*data.c2.*(pg) + mod1.qpsub_c2 = data.c2 + + #w theta bound + for l = 1: data.nline + mod1.ls[l,1] = -2*data.FrVmBound[2*l]*data.ToVmBound[2*l] #wijR lb + mod1.us[l,1] = 2*data.FrVmBound[2*l]*data.ToVmBound[2*l] #wijR ub + mod1.ls[l,2] = -2*data.FrVmBound[2*l]*data.ToVmBound[2*l] #wijI lb + mod1.us[l,2] = 2*data.FrVmBound[2*l]*data.ToVmBound[2*l] #wijI ub + + mod1.ls[l,3] = data.FrVmBound[1+2*(l-1)]^2 - (line_var)[3,l] #wi lb + mod1.us[l,3] = data.FrVmBound[2*l]^2 - (line_var)[3,l] #wi ub + mod1.ls[l,4] = data.ToVmBound[1+2*(l-1)]^2 - (line_var)[4,l] #wj lb + mod1.us[l,4] = data.ToVmBound[2*l]^2 - (line_var)[4,l] #wj ub + + mod1.ls[l,5] = data.FrVaBound[1+2*(l-1)] - (line_var)[5,l] #ti lb + mod1.us[l,5] = data.FrVaBound[2*l] - (line_var)[5,l] #ti ub + mod1.ls[l,6] = data.ToVaBound[1+2*(l-1)] - (line_var)[6,l] #tj lb + mod1.us[l,6] = data.ToVaBound[2*l] - (line_var)[6,l] #tj ub + end + + for b = 1:data.nbus + mod1.qpsub_Pd[b] = data.baseMVA * (data.Pd[b]/data.baseMVA - ((pgb)[b] - (pft)[b] - (ptf)[b] - data.YshR[b]*(bus_w)[b])) + mod1.qpsub_Qd[b] = data.baseMVA * (data.Qd[b]/data.baseMVA - ((qgb)[b] - (qft)[b] - (qtf)[b] + data.YshI[b]*(bus_w)[b])) + end + + for l = 1: data.nline + #Hs:(6,6) #w_ijR, w_ijI, w_i, w_j, theta_i, theta_j + Hs = zeros(6,6) + + Hs_14h = zeros(6,6) + Hs_14h[1,1] = 2*pi_14[1,l] + Hs_14h[2,2] = 2*pi_14[1,l] + Hs_14h[3,4] = -pi_14[1,l] + Hs_14h[4,3] = -pi_14[1,l] + + Hs_14i = zeros(6,6) + cons_1 = pi_14[2,l]*cos((line_var)[5,l] - (line_var)[6,l]) + cons_2 = pi_14[2,l]*sin((line_var)[5,l] - (line_var)[6,l]) + cons_3 = pi_14[2,l]*(-(line_var)[1,l]*sin((line_var)[5,l] - (line_var)[6,l]) + (line_var)[1,2]*cos((line_var)[5,l] - (line_var)[6,l])) + + Hs_14i[1,5] = cons_1 #wijR theta_i + Hs_14i[5,1] = cons_1 #wijR theta_i + Hs_14i[1,6] = -cons_1 #wijR theta_j + Hs_14i[6,1] = -cons_1 #wijR theta_j + + Hs_14i[2,5] = cons_2 #wijR theta_i + Hs_14i[5,2] = cons_2 #wijR theta_i + Hs_14i[2,6] = -cons_2 #wijR theta_j + Hs_14i[6,2] = -cons_2 #wijR theta_j + + Hs_14i[5,5] = cons_3 #thetai thetai + Hs_14i[6,6] = cons_3 #thetaj thetaj + Hs_14i[5,6] = -cons_3 #thetai thetaj + Hs_14i[6,5] = -cons_3 #thetaj thetai + + supY = [data.YftR[l] data.YftI[l] data.YffR[l] 0 0 0; + -data.YftI[l] data.YftR[l] -data.YffI[l] 0 0 0; + data.YtfR[l] -data.YtfI[l] 0 data.YttR[l] 0 0; + -data.YtfI[l] -data.YtfR[l] 0 -data.YttI[l] 0 0] + Hs_14j = -2*pi_14[3,l]*(supY[1,:]*transpose(supY[1,:]) + supY[2,:]*transpose(supY[2,:]) ) + Hs_14k = -2*pi_14[4,l]*(supY[3,:]*transpose(supY[3,:]) + supY[4,:]*transpose(supY[4,:]) ) + Hs .= Hs_14h + Hs_14i + Hs_14j + Hs_14k + UniformScaling(4) + mod1.Hs[6*(l-1)+1:6*l,1:6] .= Hs + + is_Hs_sym[l] = maximum(abs.(Hs - transpose(Hs))) + @assert is_Hs_sym[l] <= 1e-6 + eival, eivec = eigen(Hs) + is_Hs_PSD[l] = minimum(eival) + @assert is_Hs_PSD[l] >= 0.0 + + #inherit structure of Linear Constraint (overleaf): ignore 1h and 1i with zero assignment in ipopt benchmark + LH_1h = [2*(line_var[1,l]), 2*(line_var[2,l]), -(line_var[4,l]), -(line_var[3,l])] #LH * x = RH + mod1.LH_1h[l,:] .= LH_1h + RH_1h = -((line_var)[1,l])^2 - ((line_var)[2,l])^2 + (line_var)[3,l]*(line_var)[4,l] + mod1.RH_1h[l] = RH_1h + + LH_1i = [sin((line_var)[5,l] - (line_var)[6,l]), -cos((line_var)[5,l] - (line_var)[6,l]), + 
(line_var)[1,l]*cos((line_var)[5,l] - (line_var)[6,l]) + (line_var)[2,l]*sin((line_var)[5,l] - (line_var)[6,l]), + -(line_var)[1,l]*cos((line_var)[5,l] - (line_var)[6,l]) - (line_var)[2,l]*sin((line_var)[5,l] - (line_var)[6,l])] #Lf * x = RH + mod1.LH_1i[l,:] .= LH_1i + RH_1i = -(line_var)[1,l]*sin((line_var)[5,l] - (line_var)[6,l]) + (line_var)[2,l]*cos((line_var)[5,l] - (line_var)[6,l]) + mod1.RH_1i[l] = RH_1i + + #inherit structure line limit constraint (overleaf) + LH_1j = [2*(line_fl)[1,l], 2*(line_fl)[2,l]] #rand(2) + mod1.LH_1j[l,:] .= LH_1j + RH_1j = -(((line_fl)[1,l])^2 + ((line_fl)[2,l])^2 - data.rateA[l]) + mod1.RH_1j[l] = RH_1j + + LH_1k = [2*(line_fl)[3,l], 2*(line_fl)[4,l]] #zeros(2) #rand(2) + mod1.LH_1k[l,:] .= LH_1k + RH_1k = -(((line_fl)[3,l])^2 + ((line_fl)[4,l])^2 - data.rateA[l]) + mod1.RH_1k[l] = RH_1k + end + end #inbound + + # initialize type + if use_gpu + TD = CuArray{Float64,1}; TI = CuArray{Int,1}; TM = CuArray{Float64,2} + end + + env2 = ExaAdmm.AdmmEnv{T,TD,TI,TM}(case, rho_pq, rho_va; use_gpu = true, verbose=verbose) + mod2 = ExaAdmm.ModelQpsub{T,TD,TI,TM}(env2) + sol2 = mod2.solution + par2 = env2.params + data2 = mod2.grid_data + info2 = mod2.info + + env2.params.scale = 1e-4 + env2.params.initial_beta = 1e3 + env2.params.beta = 1e3 + + mod2.Hs = copy(mod1.Hs) + mod2.LH_1h = copy(mod1.LH_1h) + mod2.RH_1h = copy(mod1.RH_1h) + mod2.LH_1i = copy(mod1.LH_1i) + mod2.RH_1i = copy(mod1.RH_1i) + mod2.LH_1j = copy(mod1.LH_1j) + mod2.RH_1j = copy(mod1.RH_1j) + mod2.LH_1k = copy(mod1.LH_1k) + mod2.RH_1k = copy(mod1.RH_1k) + mod2.ls = copy(mod1.ls) + mod2.us = copy(mod1.us) + + mod2.qpsub_pgmax = copy(mod1.qpsub_pgmax) + mod2.qpsub_pgmin = copy(mod1.qpsub_pgmin) + mod2.qpsub_qgmax = copy(mod1.qpsub_qgmax) + mod2.qpsub_qgmin = copy(mod1.qpsub_qgmin) + + mod2.qpsub_c1 = copy(mod1.qpsub_c1) + mod2.qpsub_c2 = copy(mod1.qpsub_c2) + mod2.qpsub_Pd = copy(mod1.qpsub_Pd) + mod2.qpsub_Qd = copy(mod1.qpsub_Qd) + + ExaAdmm.init_solution!(mod2, mod2.solution, env2.initial_rho_pq, env2.initial_rho_va) + + info2.norm_z_prev = info2.norm_z_curr = 0 + par2.initial_beta = 0 + par2.beta = 0 + sol2.lz .= 0 + sol2.z_curr .= 0 + sol2.z_prev .= 0 + par2.inner_iterlim = 1 + par2.shmem_size = sizeof(Float64)*(16*mod2.n+4*mod2.n^2+178) + sizeof(Int)*(4*mod2.n) + + + ExaAdmm.admm_increment_outer(env2, mod2) + ExaAdmm.admm_increment_reset_inner(env2, mod2) + ExaAdmm.admm_increment_inner(env2, mod2) + + + + ExaAdmm.admm_update_x(env2, mod2) + U_SOL_CPU = [-0.229424, -0.1339605, -0.0813327, -0.0049391, -0.0465958, 0.2252203, -0.1236772, -0.1271249, 0.1236772, 0.1189746, -0.1182455, -0.1040701, -0.0, 0.0022044, -0.0630247, -0.1092983, 0.0622891, 0.1082608, -0.0297814, -0.0074735, 0.0422504, 0.0451598, 0.0984247, 0.0700557, -0.102193, -0.0905333, 0.0294319, -0.0067947, 0.0250279, 0.0125998, -0.1368894, 0.2542204, 0.1368894, -0.2698603, -0.0770438, -0.1077549, -0.0375397, -0.0344878, -0.0665263, -0.1146067, 0.0658612, 0.1071325, -0.0032826, 0.0208988, -0.0055371, -0.0008479, 0.1049601, 0.1480269, -0.1061072, -0.1593811, 0.0230133, -0.0010432, -0.0026878, -0.0082759, 0.2045771, -0.05922, -0.2045771, 0.0340703, -0.0553384, -0.0495078, -0.0467405, -0.0542589, -0.1322638, -0.1856806, 0.1264451, 0.1533923, -0.0223818, 0.0420754, 0.0141644, 0.0280826, 0.0937455, 0.2743344, -0.0957246, -0.2938647, 0.0406687, -0.0099012, 0.0507601, 0.0458481] + @test norm(U_SOL_CPU - Array(sol2.u_curr), Inf) <= atol + + + ExaAdmm.admm_update_xbar(env2, mod2) + V_SOL_CPU = [-0.1765506, -0.1305427, -0.1429549, 0.0145656, 
-0.0917426, 0.2397204, -0.1765506, -0.1305427, 0.1353679, 0.2137041, -0.1182455, -0.0479176, -0.0, 0.030101, -0.051334, -0.0145688, -0.0180678, 0.0191025, -0.0479176, 0.0109792, 0.030101, 0.0350939, 0.0180678, -0.0191025, -0.091583, 0.0678001, 0.0109792, -0.0392774, 0.0350939, -0.0091417, -0.0917426, 0.2397204, 0.1474993, -0.1115269, -0.0770438, -0.0392774, -0.0375397, -0.0091417, -0.0559163, 0.0437267, -0.0195494, -0.0204472, -0.0392774, 0.0219561, -0.0091417, -0.0017678, 0.0195494, 0.0204472, -0.0948426, -0.0246205, 0.0219561, -0.0262545, -0.0017678, -0.0136173, 0.2158417, 0.0755405, -0.1429549, 0.0145656, -0.0262545, -0.0495078, -0.0136173, -0.0542589, -0.1209991, -0.05092, 0.0163498, -0.0604711, -0.0262545, 0.041372, -0.0136173, 0.0394213, -0.0163498, 0.0604711, -0.0840339, -0.1991353, 0.041372, -0.0479176, 0.0394213, 0.030101] + @test norm(V_SOL_CPU - Array(sol2.v_curr), Inf) <= atol + + + + ExaAdmm.admm_update_l_single(env2, mod2) + L_SOL_CPU = [-1.057468, -0.0683565, 1.2324431, -0.390094, 0.9029359, -0.2900002, 1.057468, 0.0683565, -0.2338139, -1.8945894, 0.0, -1.1230512, 0.0, -0.5579311, -0.2338139, -1.8945894, 1.6071386, 1.7831649, 0.362723, -0.3690544, 0.2429884, 0.201319, 1.6071386, 1.7831649, -0.2121989, -3.166669, 0.3690544, 0.6496544, -0.201319, 0.4348308, -0.9029359, 0.2900002, -0.2121989, -3.166669, 0.0, -1.3695503, 0.0, -0.5069225, -0.2121989, -3.166669, 1.7082129, 2.5515935, 0.7198958, -0.0211449, 0.0720918, 0.0183997, 1.7082129, 2.5515935, -0.2252933, -2.6952112, 0.0211449, 0.5042264, -0.0183997, 0.1068284, -0.2252933, -2.6952112, -1.2324431, 0.390094, -0.5816791, 0.0, -0.6624625, 0.0, -0.2252933, -2.6952112, 2.2019063, 4.277267, 0.0774527, 0.0140663, 0.5556341, -0.2267749, 2.2019063, 4.277267, -0.2338139, -1.8945894, -0.0140663, 0.7603282, 0.2267749, 0.3149427] + @test norm(L_SOL_CPU - Array(sol2.l_curr), Inf) <= atol + + + + + ExaAdmm.admm_update_residual(env2, mod2) + RD_SOL_CPU = [-12.7113319, 0.0683565, -6.3497307, 0.390094, -10.4678211, 0.2900002, -3.5310124, 66.8335914, 2.7073584, 69.0505445, 1.6350909, 2.7727729, 0.0, -0.2033556, 6.2198773, 38.6305246, 6.0542367, 34.8407717, 2.7727729, 3.5228599, -0.2033556, -0.5905927, 4.5961859, 17.4861766, 3.2963767, 22.992984, 3.5228599, 3.2144525, -0.5905927, 0.2065699, -1.8348521, 63.0638614, 2.9499869, 66.0288516, 1.8737146, 3.2144525, 0.5884628, 0.2065699, 3.5020235, 39.5936185, 3.6977135, 33.8548619, 3.2144525, 3.9788551, 0.2065699, -0.2715214, 6.1151706, 48.632426, 4.5716386, 54.0015074, 3.9788551, 3.4749108, -0.2715214, 0.2269477, 4.3168345, 65.5108139, -2.8590981, 62.5254432, 3.4749108, 2.8994775, 0.2269477, 0.8395123, 2.3304348, 22.2701389, 3.596083, 14.8170113, 3.4749108, 3.5801139, 0.2269477, -0.7247947, 3.4309188, 32.9094664, 3.4130061, 38.9852777, 3.5801139, 2.7727729, -0.7247947, -0.2033556] + RP_SOL_CPU = [-0.0528734, -0.0034178, 0.0616222, -0.0195047, 0.0451468, -0.0145, 0.0528734, 0.0034178, -0.0116907, -0.0947295, 0.0, -0.0561526, 0.0, -0.0278966, -0.0116907, -0.0947295, 0.0803569, 0.0891582, 0.0181362, -0.0184527, 0.0121494, 0.0100659, 0.0803569, 0.0891582, -0.0106099, -0.1583335, 0.0184527, 0.0324827, -0.0100659, 0.0217415, -0.0451468, 0.0145, -0.0106099, -0.1583335, 0.0, -0.0684775, 0.0, -0.0253461, -0.0106099, -0.1583335, 0.0854106, 0.1275797, 0.0359948, -0.0010572, 0.0036046, 0.00092, 0.0854106, 0.1275797, -0.0112647, -0.1347606, 0.0010572, 0.0252113, -0.00092, 0.0053414, -0.0112647, -0.1347606, -0.0616222, 0.0195047, -0.029084, 0.0, -0.0331231, 0.0, -0.0112647, -0.1347606, 0.1100953, 0.2138633, 
0.0038726, 0.0007033, 0.0277817, -0.0113387, 0.1100953, 0.2138633, -0.0116907, -0.0947295, -0.0007033, 0.0380164, 0.0113387, 0.0157471] + @test norm(RD_SOL_CPU - Array(sol2.rd), Inf) <= atol + @test norm(RP_SOL_CPU - Array(sol2.rp), Inf) <= atol +end + + +@testset "Qpsub ADMM (GPU) vs ADMM (CPU)" begin + env3, mod3 = ExaAdmm.solve_qpsub(case, mod1.Hs, mod1.LH_1h, mod1.RH_1h, + mod1.LH_1i, mod1.RH_1i, mod1.LH_1j, mod1.RH_1j, mod1.LH_1k, mod1.RH_1k, mod1.ls, mod1.us, mod1.qpsub_pgmax, mod1.qpsub_pgmin, mod1.qpsub_qgmax, mod1.qpsub_qgmin, mod1.qpsub_c1, mod1.qpsub_c2, mod1.qpsub_Pd, mod1.qpsub_Qd, + initial_beta; + outer_iterlim=10000, inner_iterlim=1, scale = 1e-4, obj_scale = 1, rho_pq = 4000.0, rho_va = 4000.0, verbose=0, outer_eps=2*1e-6, onelevel = true, use_gpu = true) + + + @test mod3.info.status == :Solved + @test mod3.info.outer == 5107 + @test mod3.info.cumul == 5107 + @test isapprox(mod3.info.objval, -21.92744641968529; atol=1e-6) + + sol3 = mod3.solution + atol = 1e-6 + + dpg_sol_cpu = [-0.11550455128840183 + 0.06549995591228526 + 0.053676513010466456] + + dqg_sol_cpu = [-0.007903958357205233 + 0.019785362536583685 + 0.011746408694041054] + + dline_var_cpu = [-0.00630369 -0.00556137 -0.00333908 0.000404039 9.04974e-6 -8.98924e-6 0.000661247 -0.00373931 -0.00567469 + -0.00665306 -0.00509225 -0.00938981 0.00314544 -0.000313391 -0.000223849 -0.00409375 0.00970327 0.00512255 + -0.00675896 -0.0063758 -0.00500354 0.00109238 -1.06479e-8 -1.48199e-6 -1.06804e-8 -1.06804e-8 -0.00534789 + -0.00637581 -0.00500357 -1.06479e-8 -1.06479e-8 -1.48608e-6 -1.06804e-8 0.00189783 -0.00534788 -0.0063758 + -3.25492e-19 0.00531377 0.00949253 0.0201868 0.0175742 0.0178357 0.0180226 0.0180226 0.00951113 + 0.00531381 0.00949261 0.0175743 0.0175742 0.0178358 0.0180226 0.021366 0.00951121 0.00531385] + + dline_fl_cpu = [ -0.115505 -0.0551048 -0.0546064 0.0536765 -0.00307677 -0.00305414 -0.0655 0.0624192 0.0598887 + -0.00790396 0.00183362 0.00363203 0.0117464 0.000273346 0.000464936 -0.0105801 0.0108191 -0.00273044 + 0.115505 0.0546064 0.0567533 -0.0536765 0.00305414 0.0030808 0.0655 -0.0598887 -0.0603997 + -0.00125214 -0.00363203 0.00662172 -0.00689505 -0.000464924 -0.000239002 0.0197854 0.00273044 -0.000581486] + + dtheta_sol_cpu = [-3.25491739361945e-19 + 0.021365998519708898 + 0.020186769395364127 + 0.005313812154732135 + 0.009492570006911722 + 0.017574220141571076 + 0.017835751182986154 + 0.01802260774972062 + 0.009511171536802466] + + dw_sol_cpu = [-0.006758956828784941 + 0.0018978329222831053 + 0.0010923793242977361 + -0.006375802955645012 + -0.005003557142581212 + -1.0647877468628053e-8 + -1.484035364757708e-6 + -1.068039123808262e-8 + -0.0053478835040742636] + + dual_infeas_1_cpu = [-254.11001283448402 + 111.34992505088496 + 131.50745687564282 + 0.31062229996245705 + -8.03424172781389 + -0.3078548536034032 + -0.07573915941618026 + 0.012603427412195068 + 0.008651826283358158 + 0.03279612499623894] + + dual_infeas_2_cpu = [-2.337593793742112 + -0.2062996739647522 + 0.10994572761041102 + 0.031770554831216935 + 0.02745497289931917 + -0.11873053563714611 + -1.2564033339874354 + -0.12073294817351554 + 0.2121498763329952 + 0.05086202591219009] + + dual_infeas_3_cpu = [0.057405134084524315 + -0.16737701814667477 + 3.670092553395319 + 0.4052710265115234 + -0.2342336895471848 + 0.07724866682808983 + 0.07379529136930803 + 0.004080775509334406 + -0.12229701235980252 + -0.0018175498192630844] + + dual_infeas_4_cpu = [-0.0019510899995607155 + 0.07059765612753019 + 0.07104222896868044 + -0.0061070561232362 + 
-0.17080362240460334 + 0.0027843138173676155 + 0.003450443341928419 + 0.07157328724238488 + 0.07186014778306046 + -0.28991023775276664] + + dual_infeas_5_cpu = [-4.200422692029046 + -0.33666635407314044 + 0.6407231013041463 + 0.0753794333813387 + 0.08217499064640721 + -0.18387848500073856 + 1.4609510208883514 + 0.26889163927860954 + -0.13184568446925882 + 0.065799538338272 + 0.04433557788772256 + 0.06271507036787678 + 2.807937471979002 + 0.0728632119983576 + -0.20915786828282304 + 0.03875039932653859 + 0.020549534704044203] + + dual_infeas_cpu = vcat(dual_infeas_1_cpu,dual_infeas_2_cpu,dual_infeas_3_cpu,dual_infeas_4_cpu,dual_infeas_5_cpu) + + lambda_cpu = [0.488894 435.795 282.155 -0.0489912 248.787 348.203 0.135763 267.164 315.305 + 0.790211 0.88808 0.816501 -0.0653646 0.576725 0.456339 0.0687841 0.210784 0.140033 + -8.67362e-19 -8.67362e-19 -0.0 -0.0 -4.33681e-19 -1.73472e-18 -0.0 -8.67362e-19 -8.67362e-19 + -2.60209e-18 -0.0 -0.0 -1.73472e-18 -8.67362e-19 -8.67362e-19 -8.67362e-19 -0.0 -8.67362e-19] + + @test norm(dpg_sol_cpu - Array(mod3.dpg_sol), Inf) <= atol + @test norm(dqg_sol_cpu - Array(mod3.dqg_sol), Inf) <= atol + @test norm(dline_var_cpu - Array(mod3.dline_var), Inf) <= atol + @test norm(dline_fl_cpu - Array(mod3.dline_fl), Inf) <= atol + @test norm(dtheta_sol_cpu - Array(mod3.dtheta_sol), Inf) <= atol + @test norm(dw_sol_cpu - Array(mod3.dw_sol), Inf) <= atol + @test norm(dual_infeas_cpu - Array(mod3.dual_infeas), Inf) <= atol + @test norm((lambda_cpu - Array(mod3.lambda))./max.(abs.(lambda_cpu),1), Inf) <= 2e-6 +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 858ef53..8f0c818 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,6 @@ using Test using LinearAlgebra using Printf -using CUDA using ExaAdmm using LazyArtifacts @@ -10,12 +9,14 @@ using LazyArtifacts const INSTANCES_DIR = joinpath(artifact"ExaData", "ExaData") const MP_DEMAND_DIR = joinpath(INSTANCES_DIR, "mp_demand") + init_time = time() @testset "Testing ExaAdmm" begin @testset "Testing ADMM algorithms on CPUs" begin include("algorithms/acopf_update_cpu.jl") include("algorithms/mpacopf_update_cpu.jl") + include("algorithms/qpsub_update_cpu.jl") end using CUDA @@ -23,6 +24,7 @@ init_time = time() @testset "Testing ADMM algorithms using CUDA.jl" begin include("algorithms/acopf_update_gpu.jl") include("algorithms/mpacopf_update_gpu.jl") + include("algorithms/qpsub_update_gpu.jl") end end
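For reference, the QP subproblem solver exercised by both new test files can also be driven directly through `ExaAdmm.solve_qpsub`. The sketch below mirrors the calls in the tests above; the positional arguments are the QP data assembled into `mod1`, the keyword values are the ones used in the tests, and passing `use_gpu=true` (as in the GPU testset) selects the CUDA path:

    env, mod = ExaAdmm.solve_qpsub(case, mod1.Hs, mod1.LH_1h, mod1.RH_1h,
        mod1.LH_1i, mod1.RH_1i, mod1.LH_1j, mod1.RH_1j, mod1.LH_1k, mod1.RH_1k,
        mod1.ls, mod1.us, mod1.qpsub_pgmax, mod1.qpsub_pgmin,
        mod1.qpsub_qgmax, mod1.qpsub_qgmin, mod1.qpsub_c1, mod1.qpsub_c2,
        mod1.qpsub_Pd, mod1.qpsub_Qd, initial_beta;
        outer_iterlim=10000, inner_iterlim=1, scale=1e-4, obj_scale=1,
        rho_pq=4000.0, rho_va=4000.0, verbose=0, outer_eps=2e-6, onelevel=true)

    mod.info.status   # :Solved once the primal/dual tolerances are met
    mod.info.objval   # QP subproblem objective value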