diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index 1cdc383d01..030e9ffdfb 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -122,8 +122,14 @@ def __init__(
         self.model_keys = (
             list(model_params["model_dict"]) if self.multi_task else ["Default"]
         )
-        self.rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
-        self.world_size = dist.get_world_size() if dist.is_available() and dist.is_initialized() else 1
+        self.rank = (
+            dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
+        )
+        self.world_size = (
+            dist.get_world_size()
+            if dist.is_available() and dist.is_initialized()
+            else 1
+        )
         self.num_model = len(self.model_keys)
 
         # Iteration config
@@ -169,7 +175,9 @@ def get_dataloader_and_buffer(_data, _params):
                 _data,
                 sampler=_sampler,
                 batch_size=None,
-                num_workers=NUM_WORKERS if dist.is_available() else 0,  # setting to 0 diverges the behavior of its iterator; should be >=1
+                num_workers=NUM_WORKERS
+                if dist.is_available()
+                else 0,  # setting to 0 diverges the behavior of its iterator; should be >=1
                 drop_last=False,
                 pin_memory=True,
             )
@@ -734,7 +742,11 @@ def step(_step_id, task_key="Default"):
             elif self.opt_type == "LKF":
                 if isinstance(self.loss, EnergyStdLoss):
                     KFOptWrapper = KFOptimizerWrapper(
-                        self.wrapper, self.optimizer, 24, 6, dist.is_available() and dist.is_initialized()
+                        self.wrapper,
+                        self.optimizer,
+                        24,
+                        6,
+                        dist.is_available() and dist.is_initialized(),
                     )
                     pref_e = self.opt_param["kf_start_pref_e"] * (
                         self.opt_param["kf_limit_pref_e"]
@@ -753,7 +765,9 @@ def step(_step_id, task_key="Default"):
                     # [coord, atype, natoms, mapping, shift, nlist, box]
                     model_pred = {"energy": p_energy, "force": p_force}
                     module = (
-                        self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper
+                        self.wrapper.module
+                        if dist.is_available() and dist.is_initialized()
+                        else self.wrapper
                     )
 
                     def fake_model():
@@ -768,10 +782,16 @@ def fake_model():
                     )
                 elif isinstance(self.loss, DenoiseLoss):
                     KFOptWrapper = KFOptimizerWrapper(
-                        self.wrapper, self.optimizer, 24, 6, dist.is_available() and dist.is_initialized()
+                        self.wrapper,
+                        self.optimizer,
+                        24,
+                        6,
+                        dist.is_available() and dist.is_initialized(),
                     )
                     module = (
-                        self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper
+                        self.wrapper.module
+                        if dist.is_available() and dist.is_initialized()
+                        else self.wrapper
                     )
                     model_pred = KFOptWrapper.update_denoise_coord(
                         input_dict,
@@ -924,7 +944,11 @@ def log_loss_valid(_task_key="Default"):
                 # Handle the case if rank 0 aborted and re-assigned
                 self.latest_model = Path(self.save_ckpt + f"-{_step_id + 1}.pt")
 
-                module = self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper
+                module = (
+                    self.wrapper.module
+                    if dist.is_available() and dist.is_initialized()
+                    else self.wrapper
+                )
                 self.save_model(self.latest_model, lr=cur_lr, step=_step_id)
                 log.info(f"Saved model to {self.latest_model}")
                 symlink_prefix_files(self.latest_model.stem, self.save_ckpt)
@@ -990,7 +1014,11 @@ def log_loss_valid(_task_key="Default"):
             prof.stop()
 
     def save_model(self, save_path, lr=0.0, step=0):
-        module = self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper
+        module = (
+            self.wrapper.module
+            if dist.is_available() and dist.is_initialized()
+            else self.wrapper
+        )
         module.train_infos["lr"] = lr
         module.train_infos["step"] = step
         torch.save(
diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py
index 4705c6d0b4..8ebe75868e 100644
--- a/deepmd/pt/utils/dataloader.py
+++ b/deepmd/pt/utils/dataloader.py
@@ -97,7 +97,11 @@ def construct_dataset(system):
 
         with Pool(
             os.cpu_count()
-            // (int(os.environ["LOCAL_WORLD_SIZE"]) if dist.is_available() and dist.is_initialized() else 1)
+            // (
+                int(os.environ["LOCAL_WORLD_SIZE"])
+                if dist.is_available() and dist.is_initialized()
+                else 1
+            )
         ) as pool:
             self.systems = pool.map(construct_dataset, systems)
 
@@ -138,7 +142,8 @@ def construct_dataset(system):
                 num_workers=0,  # Should be 0 to avoid too many threads forked
                 sampler=system_sampler,
                 collate_fn=collate_batch,
-                shuffle=(not (dist.is_available() and dist.is_initialized())) and shuffle,
+                shuffle=(not (dist.is_available() and dist.is_initialized()))
+                and shuffle,
             )
             self.dataloaders.append(system_dataloader)
             self.index.append(len(system_dataloader))
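
Every hunk above reformats the same guard: `dist.is_available() and dist.is_initialized()`, used to fall back to single-process behavior (rank 0, world size 1, no DDP wrapper) when torch.distributed is not in use. Below is a minimal standalone sketch of that pattern; the helper names `dist_ready` and `unwrap_module` are hypothetical illustrations, not part of this diff, which inlines the checks instead.

import torch
import torch.distributed as dist


def dist_ready() -> bool:
    # True only when the distributed backend is both compiled into this
    # torch build and actually initialized (e.g. via torchrun together
    # with dist.init_process_group).
    return dist.is_available() and dist.is_initialized()


def unwrap_module(wrapper: torch.nn.Module) -> torch.nn.Module:
    # DistributedDataParallel wraps the user model and exposes it as
    # `.module`; in a single-process run the wrapper is the model itself.
    return wrapper.module if dist_ready() else wrapper


# Mirrors the rank/world-size fallbacks in Trainer.__init__ above.
rank = dist.get_rank() if dist_ready() else 0
world_size = dist.get_world_size() if dist_ready() else 1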