Fix errors when distributed training and data loading are unavailable

deepmodeling · Apr 7, 2024 · 096db6a · 096db6a
1 parent 0a2995f
commit 096db6a
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 16 deletions.
diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py
@@ -159,7 +159,7 @@ def prepare_trainer_input_single(
             stat_file_path_single,
         )
 
-    rank = dist.get_rank() if dist.is_initialized() else 0
+    rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
     if not multi_task:
         (
             train_data,

diff --git a/deepmd/pt/optimizer/LKF.py b/deepmd/pt/optimizer/LKF.py
@@ -47,7 +47,7 @@ def __init__(
         # the first param, because this helps with casting in load_state_dict
         self._state = self.state[self._params[0]]
         self._state.setdefault("kalman_lambda", kalman_lambda)
-        self.dist_init = dist.is_initialized()
+        self.dist_init = dist.is_available() and dist.is_initialized()
         self.rank = dist.get_rank() if self.dist_init else 0
         self.dindex = []
         self.remainder = 0

diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
@@ -122,8 +122,8 @@ def __init__(
         self.model_keys = (
             list(model_params["model_dict"]) if self.multi_task else ["Default"]
         )
-        self.rank = dist.get_rank() if dist.is_initialized() else 0
-        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
+        self.rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
+        self.world_size = dist.get_world_size() if dist.is_available() and dist.is_initialized() else 1
         self.num_model = len(self.model_keys)
 
         # Iteration config
@@ -169,7 +169,7 @@ def get_dataloader_and_buffer(_data, _params):
                     _data,
                     sampler=_sampler,
                     batch_size=None,
-                    num_workers=NUM_WORKERS,  # setting to 0 diverges the behavior of its iterator; should be >=1
+                    num_workers=NUM_WORKERS if dist.is_available() else 0,  # setting to 0 diverges the behavior of its iterator; should be >=1
                     drop_last=False,
                     pin_memory=True,
                 )
@@ -607,7 +607,7 @@ def single_model_finetune(
         if shared_links is not None:
             self.wrapper.share_params(shared_links, resume=resuming or self.rank != 0)
 
-        if dist.is_initialized():
+        if dist.is_available() and dist.is_initialized():
             torch.cuda.set_device(LOCAL_RANK)
             # DDP will guarantee the model parameters are identical across all processes
             self.wrapper = DDP(
@@ -673,7 +673,7 @@ def run(self):
             record_file = f"Sample_rank_{self.rank}.txt"
             fout1 = open(record_file, mode="w", buffering=1)
         log.info("Start to train %d steps.", self.num_steps)
-        if dist.is_initialized():
+        if dist.is_available() and dist.is_initialized():
             log.info(f"Rank: {dist.get_rank()}/{dist.get_world_size()}")
         if self.enable_tensorboard:
             from torch.utils.tensorboard import (
@@ -734,7 +734,7 @@ def step(_step_id, task_key="Default"):
             elif self.opt_type == "LKF":
                 if isinstance(self.loss, EnergyStdLoss):
                     KFOptWrapper = KFOptimizerWrapper(
-                        self.wrapper, self.optimizer, 24, 6, dist.is_initialized()
+                        self.wrapper, self.optimizer, 24, 6, dist.is_available() and dist.is_initialized()
                     )
                     pref_e = self.opt_param["kf_start_pref_e"] * (
                         self.opt_param["kf_limit_pref_e"]
@@ -753,7 +753,7 @@ def step(_step_id, task_key="Default"):
                     # [coord, atype, natoms, mapping, shift, nlist, box]
                     model_pred = {"energy": p_energy, "force": p_force}
                     module = (
-                        self.wrapper.module if dist.is_initialized() else self.wrapper
+                        self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper
                     )
 
                     def fake_model():
@@ -768,10 +768,10 @@ def fake_model():
                     )
                 elif isinstance(self.loss, DenoiseLoss):
                     KFOptWrapper = KFOptimizerWrapper(
-                        self.wrapper, self.optimizer, 24, 6, dist.is_initialized()
+                        self.wrapper, self.optimizer, 24, 6, dist.is_available() and dist.is_initialized()
                     )
                     module = (
-                        self.wrapper.module if dist.is_initialized() else self.wrapper
+                        self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper
                     )
                     model_pred = KFOptWrapper.update_denoise_coord(
                         input_dict,
@@ -924,7 +924,7 @@ def log_loss_valid(_task_key="Default"):
                 # Handle the case if rank 0 aborted and re-assigned
                 self.latest_model = Path(self.save_ckpt + f"-{_step_id + 1}.pt")
 
-                module = self.wrapper.module if dist.is_initialized() else self.wrapper
+                module = self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper
                 self.save_model(self.latest_model, lr=cur_lr, step=_step_id)
                 log.info(f"Saved model to {self.latest_model}")
                 symlink_prefix_files(self.latest_model.stem, self.save_ckpt)
@@ -990,7 +990,7 @@ def log_loss_valid(_task_key="Default"):
             prof.stop()
 
     def save_model(self, save_path, lr=0.0, step=0):
-        module = self.wrapper.module if dist.is_initialized() else self.wrapper
+        module = self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper
         module.train_infos["lr"] = lr
         module.train_infos["step"] = step
         torch.save(

diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py
@@ -97,7 +97,7 @@ def construct_dataset(system):
 
         with Pool(
             os.cpu_count()
-            // (int(os.environ["LOCAL_WORLD_SIZE"]) if dist.is_initialized() else 1)
+            // (int(os.environ["LOCAL_WORLD_SIZE"]) if dist.is_available() and dist.is_initialized() else 1)
         ) as pool:
             self.systems = pool.map(construct_dataset, systems)
 
@@ -127,7 +127,7 @@ def construct_dataset(system):
             self.batch_sizes = batch_size * np.ones(len(systems), dtype=int)
         assert len(self.systems) == len(self.batch_sizes)
         for system, batch_size in zip(self.systems, self.batch_sizes):
-            if dist.is_initialized():
+            if dist.is_available() and dist.is_initialized():
                 system_sampler = DistributedSampler(system)
                 self.sampler_list.append(system_sampler)
             else:
@@ -138,7 +138,7 @@ def construct_dataset(system):
                 num_workers=0,  # Should be 0 to avoid too many threads forked
                 sampler=system_sampler,
                 collate_fn=collate_batch,
-                shuffle=(not dist.is_initialized()) and shuffle,
+                shuffle=(not (dist.is_available() and dist.is_initialized())) and shuffle,
             )
             self.dataloaders.append(system_dataloader)
             self.index.append(len(system_dataloader))