diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py
index 77091c3cf7..63463ceeef 100644
--- a/deepmd/pt/entrypoints/main.py
+++ b/deepmd/pt/entrypoints/main.py
@@ -159,7 +159,7 @@ def prepare_trainer_input_single(
             stat_file_path_single,
         )
 
-    rank = dist.get_rank() if dist.is_initialized() else 0
+    rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
     if not multi_task:
         (
             train_data,
diff --git a/deepmd/pt/optimizer/LKF.py b/deepmd/pt/optimizer/LKF.py
index 06b341d987..6196414243 100644
--- a/deepmd/pt/optimizer/LKF.py
+++ b/deepmd/pt/optimizer/LKF.py
@@ -47,7 +47,7 @@ def __init__(
         # the first param, because this helps with casting in load_state_dict
         self._state = self.state[self._params[0]]
         self._state.setdefault("kalman_lambda", kalman_lambda)
-        self.dist_init = dist.is_initialized()
+        self.dist_init = dist.is_available() and dist.is_initialized()
         self.rank = dist.get_rank() if self.dist_init else 0
         self.dindex = []
         self.remainder = 0
diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index 73404b0c83..1cdc383d01 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -122,8 +122,8 @@ def __init__(
         self.model_keys = (
             list(model_params["model_dict"]) if self.multi_task else ["Default"]
         )
-        self.rank = dist.get_rank() if dist.is_initialized() else 0
-        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
+        self.rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
+        self.world_size = dist.get_world_size() if dist.is_available() and dist.is_initialized() else 1
         self.num_model = len(self.model_keys)
 
         # Iteration config
@@ -169,7 +169,7 @@ def get_dataloader_and_buffer(_data, _params):
                 _data,
                 sampler=_sampler,
                 batch_size=None,
-                num_workers=NUM_WORKERS,  # setting to 0 diverges the behavior of its iterator; should be >=1
+                num_workers=NUM_WORKERS if dist.is_available() else 0,  # setting to 0 diverges the behavior of its iterator; should be >=1
                 drop_last=False,
                 pin_memory=True,
             )
@@ -607,7 +607,7 @@ def single_model_finetune(
         if shared_links is not None:
             self.wrapper.share_params(shared_links, resume=resuming or self.rank != 0)
 
-        if dist.is_initialized():
+        if dist.is_available() and dist.is_initialized():
             torch.cuda.set_device(LOCAL_RANK)
             # DDP will guarantee the model parameters are identical across all processes
             self.wrapper = DDP(
@@ -673,7 +673,7 @@ def run(self):
             record_file = f"Sample_rank_{self.rank}.txt"
             fout1 = open(record_file, mode="w", buffering=1)
         log.info("Start to train %d steps.", self.num_steps)
-        if dist.is_initialized():
+        if dist.is_available() and dist.is_initialized():
             log.info(f"Rank: {dist.get_rank()}/{dist.get_world_size()}")
         if self.enable_tensorboard:
             from torch.utils.tensorboard import (
@@ -734,7 +734,7 @@ def step(_step_id, task_key="Default"):
             elif self.opt_type == "LKF":
                 if isinstance(self.loss, EnergyStdLoss):
                     KFOptWrapper = KFOptimizerWrapper(
-                        self.wrapper, self.optimizer, 24, 6, dist.is_initialized()
+                        self.wrapper, self.optimizer, 24, 6, dist.is_available() and dist.is_initialized()
                     )
                     pref_e = self.opt_param["kf_start_pref_e"] * (
                         self.opt_param["kf_limit_pref_e"]
@@ -753,7 +753,7 @@ def step(_step_id, task_key="Default"):
                     # [coord, atype, natoms, mapping, shift, nlist, box]
                     model_pred = {"energy": p_energy, "force": p_force}
                     module = (
-                        self.wrapper.module if dist.is_initialized() else self.wrapper
+                        self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper
                     )
 
                     def fake_model():
@@ -768,10 +768,10 @@ def fake_model():
                     )
                 elif isinstance(self.loss, DenoiseLoss):
                     KFOptWrapper = KFOptimizerWrapper(
-                        self.wrapper, self.optimizer, 24, 6, dist.is_initialized()
+                        self.wrapper, self.optimizer, 24, 6, dist.is_available() and dist.is_initialized()
                     )
                     module = (
-                        self.wrapper.module if dist.is_initialized() else self.wrapper
+                        self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper
                     )
                     model_pred = KFOptWrapper.update_denoise_coord(
                         input_dict,
@@ -924,7 +924,7 @@ def log_loss_valid(_task_key="Default"):
                     # Handle the case if rank 0 aborted and re-assigned
                     self.latest_model = Path(self.save_ckpt + f"-{_step_id + 1}.pt")
 
-                    module = self.wrapper.module if dist.is_initialized() else self.wrapper
+                    module = self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper
                     self.save_model(self.latest_model, lr=cur_lr, step=_step_id)
                     log.info(f"Saved model to {self.latest_model}")
                     symlink_prefix_files(self.latest_model.stem, self.save_ckpt)
@@ -990,7 +990,7 @@ def log_loss_valid(_task_key="Default"):
             prof.stop()
 
     def save_model(self, save_path, lr=0.0, step=0):
-        module = self.wrapper.module if dist.is_initialized() else self.wrapper
+        module = self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper
         module.train_infos["lr"] = lr
         module.train_infos["step"] = step
         torch.save(
diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py
index 361bc4b0b6..4705c6d0b4 100644
--- a/deepmd/pt/utils/dataloader.py
+++ b/deepmd/pt/utils/dataloader.py
@@ -97,7 +97,7 @@ def construct_dataset(system):
         with Pool(
             os.cpu_count()
-            // (int(os.environ["LOCAL_WORLD_SIZE"]) if dist.is_initialized() else 1)
+            // (int(os.environ["LOCAL_WORLD_SIZE"]) if dist.is_available() and dist.is_initialized() else 1)
         ) as pool:
             self.systems = pool.map(construct_dataset, systems)
 
@@ -127,7 +127,7 @@ def construct_dataset(system):
             self.batch_sizes = batch_size * np.ones(len(systems), dtype=int)
         assert len(self.systems) == len(self.batch_sizes)
         for system, batch_size in zip(self.systems, self.batch_sizes):
-            if dist.is_initialized():
+            if dist.is_available() and dist.is_initialized():
                 system_sampler = DistributedSampler(system)
                 self.sampler_list.append(system_sampler)
             else:
@@ -138,7 +138,7 @@ def construct_dataset(system):
                 num_workers=0,  # Should be 0 to avoid too many threads forked
                 sampler=system_sampler,
                 collate_fn=collate_batch,
-                shuffle=(not dist.is_initialized()) and shuffle,
+                shuffle=(not (dist.is_available() and dist.is_initialized())) and shuffle,
             )
             self.dataloaders.append(system_dataloader)
             self.index.append(len(system_dataloader))
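
Note: every hunk above applies the same guard. On PyTorch builds compiled without the distributed package (e.g. some macOS/Windows wheels), dist.is_available() returns False and calling dist.is_initialized() is not safe, so availability must be checked first. A minimal sketch of the pattern, assuming only the public torch.distributed API; the helper names below are illustrative and not part of deepmd:

import torch.distributed as dist

def get_rank_safe() -> int:
    # is_available() short-circuits before is_initialized(), which may not be
    # usable on builds lacking distributed support.
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    return 0  # single-process fallback

def get_world_size_safe() -> int:
    if dist.is_available() and dist.is_initialized():
        return dist.get_world_size()
    return 1  # single-process fallback

With helpers like these, the repeated ternaries (e.g. in training.py's __init__) could collapse to self.rank = get_rank_safe(); the patch instead inlines the guard at each call site, which keeps the diff local but duplicates the condition.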