Skip to content

Commit

Permalink
Merge pull request #1230 from jim19930609/paddle
Browse files Browse the repository at this point in the history
Revert "Force env_mat force_se_a virial_se_a to fallback on CPU"
  • Loading branch information
amcadmus authored Oct 25, 2021
2 parents d55286d + 45a2962 commit e5aeb25
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 120 deletions.
17 changes: 15 additions & 2 deletions deepmd/train/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import os
import time
import shutil
import copy
import gc
import numpy as np
from deepmd.env import tf, paddle
from deepmd.env import default_tf_session_config
Expand Down Expand Up @@ -79,6 +81,7 @@ class DPTrainer (object):
def __init__(self,
jdata,
run_opt):
paddle.set_device("cpu")
self.run_opt = run_opt
self._init_param(jdata)

Expand Down Expand Up @@ -387,6 +390,9 @@ def train (self,
% (self.cur_batch, train_time, test_time))
train_time = 0

if self.save_freq > 0 and self.cur_batch % self.save_freq == 0:
self.save_model(model_inputs, self.save_ckpt + "/model")

if self.run_opt.is_chief:
fp.close ()
if self.profiling and self.run_opt.is_chief :
Expand All @@ -400,15 +406,22 @@ def train (self,
def save_model(self, model_inputs_, folder_name_):
# Since "paddle.jit.to_static" modifies the model in-place
# We have to make a temporary model copy to avoid damage to the original model.
model = copy.copy(self.model)
save_path = os.getcwd() + "/" + folder_name_
if self.fitting_type == "ener" and self.descrpt_type == "se_a":
input_names = ['coord', 'type', 'natoms_vec', 'box', 'default_mesh']
input_specs = [paddle.static.InputSpec(model_inputs_[name].shape, model_inputs_[name].dtype, name=name) for name in input_names]
else:
raise NotImplementedError

model = paddle.jit.to_static(self.model, input_spec=input_specs)
paddle.jit.save(model, save_path)
try:
model = paddle.jit.to_static(model, input_spec=input_specs)
paddle.jit.save(model, save_path)
except Exception as e:
raise e
finally:
del model
gc.collect()

log.info("saved checkpoint to %s" % (save_path))

Expand Down
1 change: 1 addition & 0 deletions examples/water/train/water_se_a.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
"disp_file": "lcurve.out",
"disp_freq": 100,
"numb_test": 10,
"save_freq": 1000,
"save_ckpt": "model.ckpt",
"load_ckpt": "model.ckpt",
"disp_training":true,
Expand Down
97 changes: 23 additions & 74 deletions source/op/paddle_ops/srcs/pd_prod_env_mat_multi_devices_cpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,6 @@ _prepare_coord_nlist_cpu(
const int &max_cpy_trial,
const int &max_nnei_trial);

// Numerical regression between CUDA 10.1 & CUDA 11.2
// Disable CUDA support until latest changes on
// /source/lib/src/cuda/xxx.cu get merged
/*
#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor> PdProdEnvMatAOpCUDAForward(
const paddle::Tensor &coord_tensor,
Expand All @@ -91,7 +87,6 @@ std::vector<paddle::Tensor> PdProdEnvMatAOpCUDAForward(
std::vector<int> sel_a,
std::vector<int> sel_r);
#endif
*/

template <typename data_t>
void PdProdEnvMatAOpCPUForwardKernel(
Expand Down Expand Up @@ -149,13 +144,13 @@ std::vector<paddle::Tensor> PdProdEnvMatAOpCPUForward(
std::vector<int> sel_a,
std::vector<int> sel_r)
{
CHECK_INPUT_READY(coord_tensor);
CHECK_INPUT_READY(type_tensor);
CHECK_INPUT_READY(natoms_tensor);
CHECK_INPUT_READY(box_tensor);
CHECK_INPUT_READY(mesh_tensor);
CHECK_INPUT_READY(avg_tensor);
CHECK_INPUT_READY(std_tensor);
CHECK_INPUT(coord_tensor);
CHECK_INPUT(type_tensor);
CHECK_INPUT(natoms_tensor);
CHECK_INPUT(box_tensor);
CHECK_INPUT(mesh_tensor);
CHECK_INPUT(avg_tensor);
CHECK_INPUT(std_tensor);

std::vector<int> sec_a;
std::vector<int> sec_r;
Expand Down Expand Up @@ -195,15 +190,7 @@ std::vector<paddle::Tensor> PdProdEnvMatAOpCPUForward(
PD_CHECK(sec_r.back() == 0, "Rotational free descriptor only support all-angular information: sel_r should be all zero.");
PD_CHECK(natoms_tensor.shape()[0] >= 3, "Number of atoms should be larger than (or equal to) 3");
// Paddle: the device is set on the Python side, not inside the custom op

// TODO: This code should be removed once cuda issue fixed.
const int* natoms = nullptr;
if(natoms_tensor.place() != paddle::PlaceType::kCPU){
natoms = natoms_tensor.copy_to<int>(paddle::PlaceType::kCPU).data<int>();
}else{
natoms = natoms_tensor.data<int>();
}

const int *natoms = natoms_tensor.data<int>();
int nloc = natoms[0];
int nall = natoms[1];
int ntypes = natoms_tensor.shape()[0] - 2; //nloc and nall mean something.
Expand Down Expand Up @@ -256,41 +243,21 @@ std::vector<paddle::Tensor> PdProdEnvMatAOpCPUForward(
paddle::Tensor descrpt_deriv_tensor = paddle::Tensor(paddle::PlaceType::kCPU, descrpt_deriv_shape);
paddle::Tensor rij_tensor = paddle::Tensor(paddle::PlaceType::kCPU, rij_shape);
paddle::Tensor nlist_tensor = paddle::Tensor(paddle::PlaceType::kCPU, nlist_shape);

if(natoms_tensor.place() == paddle::PlaceType::kCPU) {
PD_DISPATCH_FLOATING_TYPES(
coord_tensor.type(), "pd_prod_env_mat_a_cpu_forward_kernel", ([&] {
PdProdEnvMatAOpCPUForwardKernel<data_t>(
nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei, max_nbor_size,
mesh_tensor.data<int>(), nei_mode, rcut_a, rcut_r, rcut_r_smth, max_cpy_trial, max_nnei_trial, b_nlist_map, sec_a, sec_r,
descrpt_tensor.mutable_data<data_t>(),
descrpt_deriv_tensor.mutable_data<data_t>(),
rij_tensor.mutable_data<data_t>(),
nlist_tensor.mutable_data<int>(),
coord_tensor.data<data_t>(),
box_tensor.data<data_t>(),
avg_tensor.data<data_t>(),
std_tensor.data<data_t>(),
type_tensor.data<int>());
}));
} else {
PD_DISPATCH_FLOATING_TYPES(
coord_tensor.type(), "pd_prod_env_mat_a_cpu_forward_kernel", ([&] {
PdProdEnvMatAOpCPUForwardKernel<data_t>(
nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei, max_nbor_size,
mesh_tensor.size() == 0 ? mesh_tensor.data<int>() : mesh_tensor.copy_to<int>(paddle::PlaceType::kCPU).data<int>(),
nei_mode, rcut_a, rcut_r, rcut_r_smth, max_cpy_trial, max_nnei_trial, b_nlist_map, sec_a, sec_r,
descrpt_tensor.mutable_data<data_t>(),
descrpt_deriv_tensor.mutable_data<data_t>(),
rij_tensor.mutable_data<data_t>(),
nlist_tensor.mutable_data<int>(),
coord_tensor.copy_to<data_t>(paddle::PlaceType::kCPU).data<data_t>(),
box_tensor.copy_to<data_t>(paddle::PlaceType::kCPU).data<data_t>(),
avg_tensor.copy_to<data_t>(paddle::PlaceType::kCPU).data<data_t>(),
std_tensor.copy_to<data_t>(paddle::PlaceType::kCPU).data<data_t>(),
type_tensor.copy_to<int>(paddle::PlaceType::kCPU).data<int>());
}));
}
PD_DISPATCH_FLOATING_TYPES(
coord_tensor.type(), "pd_prod_env_mat_a_cpu_forward_kernel", ([&] {
PdProdEnvMatAOpCPUForwardKernel<data_t>(
nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei, max_nbor_size,
mesh_tensor.data<int>(), nei_mode, rcut_a, rcut_r, rcut_r_smth, max_cpy_trial, max_nnei_trial, b_nlist_map, sec_a, sec_r,
descrpt_tensor.mutable_data<data_t>(),
descrpt_deriv_tensor.mutable_data<data_t>(),
rij_tensor.mutable_data<data_t>(),
nlist_tensor.mutable_data<int>(),
coord_tensor.data<data_t>(),
box_tensor.data<data_t>(),
avg_tensor.data<data_t>(),
std_tensor.data<data_t>(),
type_tensor.data<int>());
}));

return {descrpt_tensor, descrpt_deriv_tensor, rij_tensor, nlist_tensor};
}
Expand All @@ -315,23 +282,6 @@ std::vector<paddle::Tensor> PdProdEnvMatAOpForward(
CHECK_INPUT_READY(mesh_tensor);
CHECK_INPUT_READY(avg_tensor);
CHECK_INPUT_READY(std_tensor);

// Force dispatch to CPU until CUDA bug fixed
return PdProdEnvMatAOpCPUForward(
coord_tensor,
type_tensor,
natoms_tensor,
box_tensor,
mesh_tensor,
avg_tensor,
std_tensor,
rcut_a,
rcut_r,
rcut_r_smth,
sel_a,
sel_r
);
/*
if (coord_tensor.place() == paddle::PlaceType::kCPU) {
return PdProdEnvMatAOpCPUForward(
coord_tensor,
Expand Down Expand Up @@ -367,7 +317,6 @@ std::vector<paddle::Tensor> PdProdEnvMatAOpForward(
} else {
PD_THROW("Not implemented.");
}
*/
}
template <typename FPTYPE>
static void
Expand Down
50 changes: 12 additions & 38 deletions source/op/paddle_ops/srcs/pd_prod_force_se_a_multi_devices_cpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,6 @@



// Numerical regression between CUDA 10.1 & CUDA 11.2
// Disable CUDA support until latest changes on
// /source/lib/src/cuda/xxx.cu get merged
/*
#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor> PdProdForceSeAOpCUDAForward(
const paddle::Tensor& net_deriv_tensor,
Expand All @@ -23,7 +19,6 @@ const paddle::Tensor& natoms_tensor,
int n_a_sel,
int n_r_sel);
#endif
*/

template <typename data_t>
void PdProdForceSeAOpForwardCPUKernel(
Expand All @@ -49,24 +44,18 @@ const paddle::Tensor& natoms_tensor,
int n_a_sel,
int n_r_sel
){
CHECK_INPUT_READY(net_deriv_tensor);
CHECK_INPUT_READY(in_deriv_tensor);
CHECK_INPUT_READY(nlist_tensor);
CHECK_INPUT_READY(natoms_tensor);
CHECK_INPUT(net_deriv_tensor);
CHECK_INPUT(in_deriv_tensor);
CHECK_INPUT(nlist_tensor);
CHECK_INPUT(natoms_tensor);

CHECK_INPUT_DIM(net_deriv_tensor, 2);
CHECK_INPUT_DIM(in_deriv_tensor, 2);
CHECK_INPUT_DIM(nlist_tensor, 2);
CHECK_INPUT_DIM(natoms_tensor, 1);

PD_CHECK(natoms_tensor.shape()[0] >= 3, "number of atoms should be larger than (or equal to) 3");
// TODO: This code should be removed once cuda issue fixed.
const int* natoms = nullptr;
if(natoms_tensor.place() != paddle::PlaceType::kCPU){
natoms = natoms_tensor.copy_to<int>(paddle::PlaceType::kCPU).data<int>();
}else{
natoms = natoms_tensor.data<int>();
}
const int* natoms = natoms_tensor.data<int>();
int nloc = natoms[0];
int nall = natoms[1];
int nframes = net_deriv_tensor.shape()[0];
Expand All @@ -90,24 +79,13 @@ int n_r_sel
assert (nloc * nnei == nlist_tensor.shape()[1]);
assert (nnei * 4 == ndescrpt);

if(natoms_tensor.place() == paddle::PlaceType::kCPU){
PD_DISPATCH_FLOATING_TYPES(
net_deriv_tensor.type(), "pd_prod_force_se_a_cpu_forward_kernel", ([&] {
PdProdForceSeAOpForwardCPUKernel<data_t>(
nloc, nall, nframes, ndescrpt, nnei,
force_tensor.mutable_data<data_t>(), net_deriv_tensor.data<data_t>(),
in_deriv_tensor.data<data_t>(), nlist_tensor.data<int>());
}));
} else {
PD_DISPATCH_FLOATING_TYPES(
net_deriv_tensor.type(), "pd_prod_force_se_a_cpu_forward_kernel", ([&] {
PdProdForceSeAOpForwardCPUKernel<data_t>(
nloc, nall, nframes, ndescrpt, nnei,
force_tensor.mutable_data<data_t>(), net_deriv_tensor.copy_to<data_t>(paddle::PlaceType::kCPU).data<data_t>(),
in_deriv_tensor.copy_to<data_t>(paddle::PlaceType::kCPU).data<data_t>(), nlist_tensor.copy_to<int>(paddle::PlaceType::kCPU).data<int>());
}));

}
PD_DISPATCH_FLOATING_TYPES(
net_deriv_tensor.type(), "pd_prod_force_se_a_cpu_forward_kernel", ([&] {
PdProdForceSeAOpForwardCPUKernel<data_t>(
nloc, nall, nframes, ndescrpt, nnei,
force_tensor.mutable_data<data_t>(), net_deriv_tensor.data<data_t>(),
in_deriv_tensor.data<data_t>(), nlist_tensor.data<int>());
}));

return {force_tensor};
}
Expand Down Expand Up @@ -221,9 +199,6 @@ const paddle::Tensor& nlist_tensor,
const paddle::Tensor& natoms_tensor,
int n_a_sel,
int n_r_sel){
// Force dispatch to CPU until CUDA bug fixed
return PdProdForceSeAOpCPUForward(net_deriv_tensor, in_deriv_tensor, nlist_tensor, natoms_tensor, n_a_sel, n_r_sel);
/*
if(net_deriv_tensor.place() == paddle::PlaceType::kCPU){
return PdProdForceSeAOpCPUForward(net_deriv_tensor, in_deriv_tensor, nlist_tensor, natoms_tensor, n_a_sel, n_r_sel);
#ifdef PADDLE_WITH_CUDA
Expand All @@ -233,7 +208,6 @@ int n_r_sel){
}else{
PD_THROW("No Such kernel for PdFrodForceSeAForward!");
}
*/
}

std::vector<paddle::Tensor> PdProdForceSeABackward(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@
#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")


// Numerical regression between CUDA 10.1 & CUDA 11.2
// Disable CUDA support until latest changes on
// /source/lib/src/cuda/xxx.cu get merged
/*
#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor> PdProdVirialSeAOpCUDAForward(
const paddle::Tensor& net_deriv_tensor,
Expand All @@ -23,7 +19,6 @@ const paddle::Tensor& natoms_tensor,
int n_a_sel,
int n_r_sel);
#endif
*/

template <typename data_t>
void PdProdVirialSeAOpForwardCPUKernel(
Expand Down
3 changes: 2 additions & 1 deletion source/tests/test_pd_prod_force_and_virial.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@
from tensorflow.python.framework import ops

from common import Data

if GLOBAL_NP_FLOAT_PRECISION == np.float32 :
global_default_fv_hh = 1e-2
global_default_dw_hh = 1e-2
global_default_places = 2
global_default_places = 3
else :
global_default_fv_hh = 1e-5
global_default_dw_hh = 1e-4
Expand Down

0 comments on commit e5aeb25

Please sign in to comment.