Add support for KA for GPUs (#34)

exanauts · Jun 27, 2022 · 18dcaee · 18dcaee
1 parent cec89e6
commit 18dcaee
Show file tree

Hide file tree

Showing 69 changed files with 2,246 additions and 126 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,13 +1,15 @@
 name = "ExaAdmm"
 uuid = "4d6a948c-1075-4240-a564-361a5d4e22a2"
 authors = ["Youngdae Kim <[email protected]>", "Kibaek Kim <[email protected]>", "Weiqi Zhang <[email protected]>", "François Pacaud <[email protected]>", "Michel Schanen <[email protected]>"]
-version = "0.1.3"
+version = "0.2.0"
 
 [deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 ExaTron = "28b18bf8-76f9-41ea-81fa-0f922810b349"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -16,8 +18,10 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [compat]
+AMDGPU = "0.3"
 CUDA = "3.4"
-ExaTron = "1"
+ExaTron = "2"
 FileIO = "1.14"
-julia = "1.7"
+KernelAbstractions = "0.8"
 MPI = "0.19"
+julia = "1.7"
diff --git a/README.md b/README.md
@@ -8,29 +8,52 @@ ExaAdmm.jl implements the two-level alternating direction method of multipliers
 The package can be installed in the Julia REPL with the command below:
 
 ```julia
-] ExaAdmm
+] add ExaAdmm
 ```
 
-Running the algorithms on GPU requires Nvidia GPUs with `CUDA.jl`. 
+Running the algorithms on the GPU requires either NVIDIA GPUs with [`CUDA.jl`](https://github.com/JuliaGPU/CUDA.jl) or [`KernelAbstractions.jl`](https://github.com/JuliaGPU/KernelAbstractions.jl) (KA) with the respective device support (e.g., [`AMDGPU.jl`](https://github.com/JuliaGPU/AMDGPU.jl) and `ROCKernels.jl`). Currently, only the ACOPF problem is supported using KA.
 
 ## How to run
 
 Currently, `ExaAdmm.jl` supports electrical grid files in the MATLAB format. You can download them from [here](https://github.com/MATPOWER/matpower).
-Below shows an example of solving `case1354pegase.m` using `ExaAdmm.jl` on GPUs.
+The example below solves `case1354pegase.m` using `ExaAdmm.jl` on an NVIDIA GPU:
 
 ```julia
-env, mod = ExaAdmm.solve_acopf(
-    "case1354pegase.m"; 
-    rho_pq=1e1, 
-    rho_va=1e3, 
-    outer_iterlim=20, 
-    inner_iterlim=20, 
-    scale=1e-4, 
-    tight_factor=0.99, 
-    use_gpu=true
+using ExaAdmm
+
+env, mod = solve_acopf(
+    "case1354pegase.m";
+    rho_pq=1e1,
+    rho_va=1e3,
+    outer_iterlim=20,
+    inner_iterlim=20,
+    scale=1e-4,
+    tight_factor=0.99,
+    use_gpu=true,
+    verbose=1
 );
 ```
-
+and the same example on an AMD GPU:
+```julia
+using ExaAdmm
+using AMDGPU
+using ROCKernels
+
+ExaAdmm.KAArray{T}(n::Int, ::ROCDevice) where {T} = ROCArray{T}(undef, n)
+
+env, mod = solve_acopf(
+    "case1354pegase.m";
+    rho_pq=1e1,
+    rho_va=1e3,
+    outer_iterlim=20,
+    inner_iterlim=20,
+    scale=1e-4,
+    tight_factor=0.99,
+    use_gpu=true,
+    ka_device = ROCDevice(),
+    verbose=1
+)
+```
 The following table shows parameter values we used for solving pegase and ACTIVSg data.
 
 Data        | rho_pq | rho_va | scale | obj_scale
@@ -49,7 +72,7 @@ We have used the same `tight_factor=0.99`, `outer_iterlim=20`, and `inner_iterli
 - Youngdae Kim and Kibaek Kim. "Accelerated Computation and Tracking of AC Optimal Power Flow Solutions using GPUs" arXiv preprint arXiv:2110.06879, 2021
 - Youngdae Kim, François Pacaud, Kibaek Kim, and Mihai Anitescu. "Leveraging GPU batching for scalable nonlinear programming through massive lagrangian decomposition" arXiv preprint arXiv:2106.14995, 2021
 
-## Acknowledgements
+## Acknowledgments
 
 This research was supported by the Exascale Computing Project (17-SC-20-SC), a collaborative effort of the U.S. Department of Energy Office of Science and the National Nuclear Security Administration.
 This material is based upon work supported by the U.S. Department of Energy, Office of Science, under contract number DE-AC02-06CH11357.

diff --git a/src/ExaAdmm.jl b/src/ExaAdmm.jl
@@ -7,15 +7,24 @@ using LinearAlgebra
 using SparseArrays
 using MPI
 using CUDA
+import AMDGPU: ROCArray, has_rocm_gpu
+using KernelAbstractions
 using ExaTron
 using Random
 
+const KA = KernelAbstractions
+
+export solve_acopf
+
+struct KAArray{T} end
+
 include("utils/parse_matpower.jl")
 include("utils/opfdata.jl")
 include("utils/environment.jl")
 include("utils/grid_data.jl")
 include("utils/print_statistics.jl")
 include("utils/utilities_gpu.jl")
+include("utils/utilities_ka.jl")
 
 include("algorithms/admm_two_level.jl")
 
@@ -44,7 +53,7 @@ include("models/acopf/acopf_admm_update_residual_cpu.jl")
 include("models/acopf/acopf_admm_update_lz_cpu.jl")
 include("models/acopf/acopf_admm_prepoststep_cpu.jl")
 
-# GPU specific implementation
+# CUDA specific implementation
 include("models/acopf/acopf_init_solution_gpu.jl")
 include("models/acopf/acopf_generator_kernel_gpu.jl")
 include("models/acopf/acopf_eval_linelimit_kernel_gpu.jl")
@@ -59,9 +68,25 @@ include("models/acopf/acopf_admm_update_residual_gpu.jl")
 include("models/acopf/acopf_admm_update_lz_gpu.jl")
 include("models/acopf/acopf_admm_prepoststep_gpu.jl")
 
+# KA specific implementation
+include("models/acopf/acopf_init_solution_ka.jl")
+include("models/acopf/acopf_generator_kernel_ka.jl")
+include("models/acopf/acopf_eval_linelimit_kernel_ka.jl")
+include("models/acopf/acopf_tron_linelimit_kernel_ka.jl")
+include("models/acopf/acopf_auglag_linelimit_kernel_ka.jl")
+include("models/acopf/acopf_bus_kernel_ka.jl")
+include("models/acopf/acopf_admm_update_x_ka.jl")
+include("models/acopf/acopf_admm_update_xbar_ka.jl")
+include("models/acopf/acopf_admm_update_z_ka.jl")
+include("models/acopf/acopf_admm_update_l_ka.jl")
+include("models/acopf/acopf_admm_update_residual_ka.jl")
+include("models/acopf/acopf_admm_update_lz_ka.jl")
+include("models/acopf/acopf_admm_prepoststep_ka.jl")
+
 # Rolling horizon
 include("models/acopf/acopf_admm_rolling_cpu.jl")
 include("models/acopf/acopf_admm_rolling_gpu.jl")
+include("models/acopf/acopf_admm_rolling_ka.jl")
 
 # ----------------------------------------
 # Multi-period ACOPF implementation
@@ -133,5 +158,4 @@ include("models/mpec/mpec_admm_update_residual_gpu.jl")
 include("models/mpec/mpec_admm_update_lz_gpu.jl")
 include("models/mpec/mpec_admm_prepoststep_gpu.jl")
 =#
-
 end # module
diff --git a/src/algorithms/admm_two_level.jl b/src/algorithms/admm_two_level.jl
@@ -1,5 +1,5 @@
 function admm_two_level(
-    env::AdmmEnv, mod::AbstractOPFModel
+    env::AdmmEnv, mod::AbstractOPFModel, device::Union{Nothing,KA.GPU}=nothing
 )
     par = env.params
     info = mod.info
@@ -13,7 +13,7 @@ function admm_two_level(
     par.beta = par.initial_beta
 
     if par.verbose > 0
-        admm_update_residual(env, mod)
+        admm_update_residual(env, mod, device)
         @printf("%8s  %8s  %10s  %10s  %10s  %10s  %10s  %10s  %10s  %10s  %10s\n",
                 "Outer", "Inner", "Objval", "AugLag", "PrimRes", "EpsPrimRes",
                 "DualRes", "||z||", "||Ax+By||", "OuterTol", "Beta")
@@ -27,19 +27,19 @@ function admm_two_level(
 
     overall_time = @timed begin
     while info.outer < par.outer_iterlim
-        admm_increment_outer(env, mod)
-        admm_outer_prestep(env, mod)
+        admm_increment_outer(env, mod, device)
+        admm_outer_prestep(env, mod, device)
 
-        admm_increment_reset_inner(env, mod)
+        admm_increment_reset_inner(env, mod, device)
         while info.inner < par.inner_iterlim
             admm_increment_inner(env, mod)
-            admm_inner_prestep(env, mod)
+            admm_inner_prestep(env, mod, device)
 
-            admm_update_x(env, mod)
-            admm_update_xbar(env, mod)
-            admm_update_z(env, mod)
-            admm_update_l(env, mod)
-            admm_update_residual(env, mod)
+            admm_update_x(env, mod, device)
+            admm_update_xbar(env, mod, device)
+            admm_update_z(env, mod, device)
+            admm_update_l(env, mod, device)
+            admm_update_residual(env, mod, device)
 
             info.eps_pri = sqrt_d / (2500*info.outer)
 
@@ -65,7 +65,7 @@ function admm_two_level(
             break
         end
 
-        admm_update_lz(env, mod)
+        admm_update_lz(env, mod, device)
 
         if info.norm_z_curr > par.theta*info.norm_z_prev
             par.beta = min(par.inc_c*par.beta, 1e24)
@@ -74,7 +74,7 @@ function admm_two_level(
     end # @timed
 
     info.time_overall = overall_time.time
-    admm_poststep(env, mod)
+    admm_poststep(env, mod, device)
 
     if par.verbose > 0
         print_statistics(env, mod)

diff --git a/src/interface/solve_acopf.jl b/src/interface/solve_acopf.jl
@@ -2,17 +2,33 @@ function solve_acopf(case::String;
     case_format="matpower",
     outer_iterlim=20, inner_iterlim=1000, rho_pq=400.0, rho_va=40000.0,
     obj_scale=1.0, scale=1e-4, storage_ratio=0.0, storage_charge_max=1.0,
-    use_gpu=false, use_linelimit=true, use_projection=false, tight_factor=1.0,
+    use_gpu=false, ka_device=nothing, use_linelimit=true, use_projection=false, tight_factor=1.0,
     outer_eps=2*1e-4, gpu_no=0, verbose=1
 )
-    T = Float64; TD = Array{Float64,1}; TI = Array{Int,1}; TM = Array{Float64,2}
-    if use_gpu
+    T = Float64
+    # 1. ka_device = nothing and use_gpu = false, CPU version of the code is used
+    # 2. ka_device = KA.CPU() and use_gpu = false, CPU version of the code is used, NOT the KA.CPU kernels
+    #    due to nested kernels limitations and no added benefit
+    # 3. ka_device = nothing and use_gpu = true, use original CUDA.jl kernels
+    # 4. ka_device is a KA.GPU and use_gpu = true, use KA kernels
+    if !use_gpu && (isa(ka_device, Nothing) || isa(ka_device, KA.CPU))
+        TD = Array{Float64,1}; TI = Array{Int,1}; TM = Array{Float64,2}
+        ka_device = nothing
+    elseif use_gpu && isa(ka_device, Nothing)
         CUDA.device!(gpu_no)
         TD = CuArray{Float64,1}; TI = CuArray{Int,1}; TM = CuArray{Float64,2}
+    elseif use_gpu && isa(ka_device, KA.Device)
+        if has_cuda_gpu()
+            TD = CuArray{Float64,1}; TI = CuArray{Int,1}; TM = CuArray{Float64,2}
+        elseif has_rocm_gpu()
+            TD = ROCArray{Float64,1}; TI = ROCArray{Int,1}; TM = ROCArray{Float64,2}
+        end  # NOTE(review): TD/TI/TM stay unassigned here if neither backend reports a GPU
+    else
+        error("Inconsistent device selection use_gpu=$use_gpu and ka_device=$(typeof(ka_device))")
     end
 
     env = AdmmEnv{T,TD,TI,TM}(case, rho_pq, rho_va; case_format=case_format,
-            use_gpu=use_gpu, use_linelimit=use_linelimit,
+            use_gpu=use_gpu, ka_device=ka_device, use_linelimit=use_linelimit,
             use_projection=use_projection, tight_factor=tight_factor, gpu_no=gpu_no,
             storage_ratio=storage_ratio, storage_charge_max=storage_charge_max,
             verbose=verbose)
@@ -24,7 +40,7 @@ function solve_acopf(case::String;
     env.params.outer_iterlim = outer_iterlim
     env.params.inner_iterlim = inner_iterlim
 
-    admm_two_level(env, mod)
+    admm_two_level(env, mod, isa(ka_device, KA.CPU) ? nothing : ka_device)
 
     return env, mod
 end
diff --git a/src/models/acopf/acopf_admm_increment.jl b/src/models/acopf/acopf_admm_increment.jl
@@ -3,7 +3,8 @@ Increment outer iteration counter by one.
 """
 function admm_increment_outer(
     env::AdmmEnv,
-    mod::AbstractOPFModel
+    mod::AbstractOPFModel,
+    device=nothing
 )
     mod.info.outer += 1
     return
@@ -14,7 +15,8 @@ Reset inner iteration counter to zero.
 """
 function admm_increment_reset_inner(
     env::AdmmEnv,
-    mod::AbstractOPFModel
+    mod::AbstractOPFModel,
+    device=nothing
 )
     mod.info.inner = 0
     return
@@ -25,7 +27,8 @@ Increment inner iteration counter by one.
 """
 function admm_increment_inner(
     env::AdmmEnv,
-    mod::AbstractOPFModel
+    mod::AbstractOPFModel,
+    device=nothing
 )
     mod.info.inner += 1
     mod.info.cumul += 1

diff --git a/src/models/acopf/acopf_admm_prepoststep_cpu.jl b/src/models/acopf/acopf_admm_prepoststep_cpu.jl
@@ -3,7 +3,8 @@ Implement any algorithmic steps required before each outer iteration.
 """
 function admm_outer_prestep(
     env::AdmmEnv{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}},
-    mod::AbstractOPFModel{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}}
+    mod::AbstractOPFModel{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}},
+    device::Nothing=nothing
 )
     sol, info = mod.solution, mod.info
     info.norm_z_prev = norm(sol.z_curr)
@@ -15,7 +16,8 @@ Implement any algorithmic steps required before each inner iteration.
 """
 function admm_inner_prestep(
     env::AdmmEnv{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}},
-    mod::AbstractOPFModel{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}}
+    mod::AbstractOPFModel{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}},
+    device::Nothing=nothing
 )
     sol = mod.solution
     sol.z_prev .= sol.z_curr
@@ -27,7 +29,8 @@ Implement any steps required after the algorithm terminates.
 """
 function admm_poststep(
     env::AdmmEnv{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}},
-    mod::AbstractOPFModel{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}}
+    mod::AbstractOPFModel{Float64,Array{Float64,1},Array{Int,1},Array{Float64,2}},
+    device::Nothing=nothing
 )
     data, sol, info, grid_data = env.data, mod.solution, mod.info, mod.grid_data
 

diff --git a/src/models/acopf/acopf_admm_prepoststep_gpu.jl b/src/models/acopf/acopf_admm_prepoststep_gpu.jl
@@ -1,6 +1,7 @@
 function admm_outer_prestep(
     env::AdmmEnv{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}},
-    mod::AbstractOPFModel{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}}
+    mod::AbstractOPFModel{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}},
+    device::Nothing=nothing
 )
     sol, info = mod.solution, mod.info
     info.norm_z_prev = CUDA.norm(sol.z_curr)
@@ -9,7 +10,8 @@ end
 
 function admm_inner_prestep(
     env::AdmmEnv{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}},
-    mod::AbstractOPFModel{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}}
+    mod::AbstractOPFModel{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}},
+    device::Nothing=nothing
 )
     sol = mod.solution
     @cuda threads=64 blocks=(div(mod.nvar-1, 64)+1) copy_data_kernel(mod.nvar, sol.z_prev, sol.z_curr)
@@ -20,7 +22,8 @@ end
 
 function admm_poststep(
     env::AdmmEnv{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}},
-    mod::AbstractOPFModel{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}}
+    mod::AbstractOPFModel{Float64,CuArray{Float64,1},CuArray{Int,1},CuArray{Float64,2}},
+    device::Nothing=nothing
 )
     data, sol, info, grid_data = env.data, mod.solution, mod.info, mod.grid_data