Skip to content

Commit

Permalink
breaking(pt/tf/dp): disable bias in type embedding (#3958)
Browse files Browse the repository at this point in the history
This PR addresses an issue observed during training with DPA2 on complex
datasets, such as `mptraj`. Specifically, the **learning curves of
energy** from the **2024Q1-based branch** and the **devel branch** show
significant differences at the very beginning when setting `tebd_dim` =
256 (and thus descriptor `dim_out` = 128 + 256). The issue is
illustrated in the following image:

<img
src="https://github.com/deepmodeling/deepmd-kit/assets/50307526/701835a4-126f-4a93-91c7-f9e685c4dc9d"
alt="Example Image" width="500">


After removing the bias in the type embedding, which affects the
standard deviation of the descriptor when `tebd_dim` is very large, the
learning curve improves significantly:

<img
src="https://github.com/deepmodeling/deepmd-kit/assets/50307526/8915e7dd-1813-42bc-8617-fe8209bc6da1"
alt="Example Image" width="500">

Notably, this behavior is not prominent when the `tebd_dim` is
relatively small compared to the descriptor dimension itself, such as when
using DPA2 with `tebd_dim` = 8 or using DPA1.

The same issue exists in the econf variant of the type embedding; it will be
addressed in a separate PR.

**NOTE**
**This PR disables bias in type embedding in all backends, which is a
breaking change.**




<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
- Introduced `use_tebd_bias` and `bias` parameters across various
components to control the use of bias in type embeddings and networks.
  
- **Updates**
- Updated serialization and deserialization methods to include the new
parameters and ensure version compatibility.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
  • Loading branch information
iProzd authored Jul 11, 2024
1 parent b86165d commit 86f6e84
Show file tree
Hide file tree
Showing 33 changed files with 280 additions and 43 deletions.
13 changes: 11 additions & 2 deletions deepmd/dpmodel/descriptor/dpa1.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ class DescrptDPA1(NativeOP, BaseDescriptor):
The default value is `None`, which means the `tebd_input_mode` setting will be used instead.
use_econf_tebd: bool, Optional
Whether to use electronic configuration type embedding.
use_tebd_bias : bool, Optional
Whether to use bias in the type embedding layer.
type_map: List[str], Optional
A list of strings. Give the name to each type of atoms.
spin
Expand Down Expand Up @@ -253,6 +255,7 @@ def __init__(
spin: Optional[Any] = None,
stripped_type_embedding: Optional[bool] = None,
use_econf_tebd: bool = False,
use_tebd_bias: bool = False,
type_map: Optional[List[str]] = None,
# consistent with argcheck, not used though
seed: Optional[Union[int, List[int]]] = None,
Expand Down Expand Up @@ -301,6 +304,7 @@ def __init__(
seed=child_seed(seed, 0),
)
self.use_econf_tebd = use_econf_tebd
self.use_tebd_bias = use_tebd_bias
self.type_map = type_map
self.type_embedding = TypeEmbedNet(
ntypes=ntypes,
Expand All @@ -309,6 +313,7 @@ def __init__(
activation_function="Linear",
precision=precision,
use_econf_tebd=use_econf_tebd,
use_tebd_bias=use_tebd_bias,
type_map=type_map,
seed=child_seed(seed, 1),
)
Expand Down Expand Up @@ -491,7 +496,7 @@ def serialize(self) -> dict:
data = {
"@class": "Descriptor",
"type": "dpa1",
"@version": 1,
"@version": 2,
"rcut": obj.rcut,
"rcut_smth": obj.rcut_smth,
"sel": obj.sel,
Expand All @@ -516,6 +521,7 @@ def serialize(self) -> dict:
"type_one_side": obj.type_one_side,
"concat_output_tebd": self.concat_output_tebd,
"use_econf_tebd": self.use_econf_tebd,
"use_tebd_bias": self.use_tebd_bias,
"type_map": self.type_map,
# make deterministic
"precision": np.dtype(PRECISION_DICT[obj.precision]).name,
Expand All @@ -541,7 +547,7 @@ def serialize(self) -> dict:
def deserialize(cls, data: dict) -> "DescrptDPA1":
"""Deserialize from dict."""
data = data.copy()
check_version_compatibility(data.pop("@version"), 1, 1)
check_version_compatibility(data.pop("@version"), 2, 1)
data.pop("@class")
data.pop("type")
variables = data.pop("@variables")
Expand All @@ -554,6 +560,9 @@ def deserialize(cls, data: dict) -> "DescrptDPA1":
embeddings_strip = data.pop("embeddings_strip")
else:
embeddings_strip = None
# compat with version 1
if "use_tebd_bias" not in data:
data["use_tebd_bias"] = True
obj = cls(**data)

obj.se_atten["davg"] = variables["davg"]
Expand Down
13 changes: 11 additions & 2 deletions deepmd/dpmodel/descriptor/dpa2.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,7 @@ def __init__(
seed: Optional[Union[int, List[int]]] = None,
add_tebd_to_repinit_out: bool = False,
use_econf_tebd: bool = False,
use_tebd_bias: bool = False,
type_map: Optional[List[str]] = None,
):
r"""The DPA-2 descriptor. see https://arxiv.org/abs/2312.15492.
Expand Down Expand Up @@ -361,6 +362,8 @@ def __init__(
Whether to add type embedding to the output representation from repinit before inputting it into repformer.
use_econf_tebd : bool, Optional
Whether to use electronic configuration type embedding.
use_tebd_bias : bool, Optional
Whether to use bias in the type embedding layer.
type_map : List[str], Optional
A list of strings. Give the name to each type of atoms.
Expand Down Expand Up @@ -449,6 +452,7 @@ def init_subclass_params(sub_data, sub_class):
seed=child_seed(seed, 1),
)
self.use_econf_tebd = use_econf_tebd
self.use_tebd_bias = use_tebd_bias
self.type_map = type_map
self.type_embedding = TypeEmbedNet(
ntypes=ntypes,
Expand All @@ -457,6 +461,7 @@ def init_subclass_params(sub_data, sub_class):
activation_function="Linear",
precision=precision,
use_econf_tebd=use_econf_tebd,
use_tebd_bias=use_tebd_bias,
type_map=type_map,
seed=child_seed(seed, 2),
)
Expand Down Expand Up @@ -720,7 +725,7 @@ def serialize(self) -> dict:
data = {
"@class": "Descriptor",
"type": "dpa2",
"@version": 1,
"@version": 2,
"ntypes": self.ntypes,
"repinit_args": self.repinit_args.serialize(),
"repformer_args": self.repformer_args.serialize(),
Expand All @@ -732,6 +737,7 @@ def serialize(self) -> dict:
"trainable": self.trainable,
"add_tebd_to_repinit_out": self.add_tebd_to_repinit_out,
"use_econf_tebd": self.use_econf_tebd,
"use_tebd_bias": self.use_tebd_bias,
"type_map": self.type_map,
"type_embedding": self.type_embedding.serialize(),
"g1_shape_tranform": self.g1_shape_tranform.serialize(),
Expand Down Expand Up @@ -774,7 +780,7 @@ def serialize(self) -> dict:
@classmethod
def deserialize(cls, data: dict) -> "DescrptDPA2":
data = data.copy()
check_version_compatibility(data.pop("@version"), 1, 1)
check_version_compatibility(data.pop("@version"), 2, 1)
data.pop("@class")
data.pop("type")
repinit_variable = data.pop("repinit_variable").copy()
Expand All @@ -785,6 +791,9 @@ def deserialize(cls, data: dict) -> "DescrptDPA2":
add_tebd_to_repinit_out = data["add_tebd_to_repinit_out"]
data["repinit"] = RepinitArgs(**data.pop("repinit_args"))
data["repformer"] = RepformerArgs(**data.pop("repformer_args"))
# compat with version 1
if "use_tebd_bias" not in data:
data["use_tebd_bias"] = True
obj = cls(**data)
obj.type_embedding = TypeEmbedNet.deserialize(type_embedding)
if add_tebd_to_repinit_out:
Expand Down
10 changes: 8 additions & 2 deletions deepmd/dpmodel/descriptor/se_atten_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def __init__(
spin: Optional[Any] = None,
stripped_type_embedding: Optional[bool] = None,
use_econf_tebd: bool = False,
use_tebd_bias: bool = False,
type_map: Optional[List[str]] = None,
# consistent with argcheck, not used though
seed: Optional[Union[int, List[int]]] = None,
Expand Down Expand Up @@ -100,6 +101,7 @@ def __init__(
spin=spin,
stripped_type_embedding=stripped_type_embedding,
use_econf_tebd=use_econf_tebd,
use_tebd_bias=use_tebd_bias,
type_map=type_map,
# consistent with argcheck, not used though
seed=seed,
Expand All @@ -111,7 +113,7 @@ def serialize(self) -> dict:
data = {
"@class": "Descriptor",
"type": "se_atten_v2",
"@version": 1,
"@version": 2,
"rcut": obj.rcut,
"rcut_smth": obj.rcut_smth,
"sel": obj.sel,
Expand All @@ -134,6 +136,7 @@ def serialize(self) -> dict:
"type_one_side": obj.type_one_side,
"concat_output_tebd": self.concat_output_tebd,
"use_econf_tebd": self.use_econf_tebd,
"use_tebd_bias": self.use_tebd_bias,
"type_map": self.type_map,
# make deterministic
"precision": np.dtype(PRECISION_DICT[obj.precision]).name,
Expand All @@ -158,7 +161,7 @@ def serialize(self) -> dict:
def deserialize(cls, data: dict) -> "DescrptSeAttenV2":
"""Deserialize from dict."""
data = data.copy()
check_version_compatibility(data.pop("@version"), 1, 1)
check_version_compatibility(data.pop("@version"), 2, 1)
data.pop("@class")
data.pop("type")
variables = data.pop("@variables")
Expand All @@ -167,6 +170,9 @@ def deserialize(cls, data: dict) -> "DescrptSeAttenV2":
attention_layers = data.pop("attention_layers")
data.pop("env_mat")
embeddings_strip = data.pop("embeddings_strip")
# compat with version 1
if "use_tebd_bias" not in data:
data["use_tebd_bias"] = True
obj = cls(**data)

obj.se_atten["davg"] = variables["davg"]
Expand Down
12 changes: 9 additions & 3 deletions deepmd/dpmodel/utils/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,8 @@ class EN(T_Network):
Floating point precision for the model paramters.
seed : int, optional
Random seed.
bias : bool, Optional
Whether to use bias in the embedding layer.
"""

def __init__(
Expand All @@ -581,6 +583,7 @@ def __init__(
resnet_dt: bool = False,
precision: str = DEFAULT_PRECISION,
seed: Optional[Union[int, List[int]]] = None,
bias: bool = True,
):
layers = []
i_in = in_dim
Expand All @@ -590,7 +593,7 @@ def __init__(
T_NetworkLayer(
i_in,
i_ot,
bias=True,
bias=bias,
use_timestep=resnet_dt,
activation_function=activation_function,
resnet=True,
Expand All @@ -605,6 +608,7 @@ def __init__(
self.activation_function = activation_function
self.resnet_dt = resnet_dt
self.precision = precision
self.bias = bias

def serialize(self) -> dict:
"""Serialize the network to a dict.
Expand All @@ -616,11 +620,12 @@ def serialize(self) -> dict:
"""
return {
"@class": "EmbeddingNetwork",
"@version": 1,
"@version": 2,
"in_dim": self.in_dim,
"neuron": self.neuron.copy(),
"activation_function": self.activation_function,
"resnet_dt": self.resnet_dt,
"bias": self.bias,
# make deterministic
"precision": np.dtype(PRECISION_DICT[self.precision]).name,
"layers": [layer.serialize() for layer in self.layers],
Expand All @@ -636,7 +641,7 @@ def deserialize(cls, data: dict) -> "EmbeddingNet":
The dict to deserialize from.
"""
data = copy.deepcopy(data)
check_version_compatibility(data.pop("@version", 1), 1, 1)
check_version_compatibility(data.pop("@version", 1), 2, 1)
data.pop("@class", None)
layers = data.pop("layers")
obj = cls(**data)
Expand Down Expand Up @@ -691,6 +696,7 @@ def __init__(
activation_function=activation_function,
resnet_dt=resnet_dt,
precision=precision,
seed=seed,
)
i_in = neuron[-1] if len(neuron) > 0 else in_dim
i_ot = out_dim
Expand Down
13 changes: 11 additions & 2 deletions deepmd/dpmodel/utils/type_embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ class TypeEmbedNet(NativeOP):
Concat the zero padding to the output, as the default embedding of empty type.
use_econf_tebd: bool, Optional
Whether to use electronic configuration type embedding.
use_tebd_bias : bool, Optional
Whether to use bias in the type embedding layer.
type_map: List[str], Optional
A list of strings. Give the name to each type of atoms.
"""
Expand All @@ -61,6 +63,7 @@ def __init__(
seed: Optional[Union[int, List[int]]] = None,
padding: bool = False,
use_econf_tebd: bool = False,
use_tebd_bias: bool = False,
type_map: Optional[List[str]] = None,
) -> None:
self.ntypes = ntypes
Expand All @@ -72,6 +75,7 @@ def __init__(
self.trainable = trainable
self.padding = padding
self.use_econf_tebd = use_econf_tebd
self.use_tebd_bias = use_tebd_bias
self.type_map = type_map
embed_input_dim = ntypes
if self.use_econf_tebd:
Expand All @@ -85,6 +89,7 @@ def __init__(
self.resnet_dt,
self.precision,
seed=self.seed,
bias=self.use_tebd_bias,
)

def call(self) -> np.ndarray:
Expand Down Expand Up @@ -114,11 +119,14 @@ def deserialize(cls, data: dict):
The deserialized model
"""
data = data.copy()
check_version_compatibility(data.pop("@version", 1), 1, 1)
check_version_compatibility(data.pop("@version", 1), 2, 1)
data_cls = data.pop("@class")
assert data_cls == "TypeEmbedNet", f"Invalid class {data_cls}"

embedding_net = EmbeddingNet.deserialize(data.pop("embedding"))
# compat with version 1
if "use_tebd_bias" not in data:
data["use_tebd_bias"] = True
type_embedding_net = cls(**data)
type_embedding_net.embedding_net = embedding_net
return type_embedding_net
Expand All @@ -133,7 +141,7 @@ def serialize(self) -> dict:
"""
return {
"@class": "TypeEmbedNet",
"@version": 1,
"@version": 2,
"ntypes": self.ntypes,
"neuron": self.neuron,
"resnet_dt": self.resnet_dt,
Expand All @@ -142,6 +150,7 @@ def serialize(self) -> dict:
"trainable": self.trainable,
"padding": self.padding,
"use_econf_tebd": self.use_econf_tebd,
"use_tebd_bias": self.use_tebd_bias,
"type_map": self.type_map,
"embedding": self.embedding_net.serialize(),
}
Expand Down
13 changes: 11 additions & 2 deletions deepmd/pt/model/descriptor/dpa1.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,8 @@ class DescrptDPA1(BaseDescriptor, torch.nn.Module):
Random seed for parameter initialization.
use_econf_tebd: bool, Optional
Whether to use electronic configuration type embedding.
use_tebd_bias : bool, Optional
Whether to use bias in the type embedding layer.
type_map: List[str], Optional
A list of strings. Give the name to each type of atoms.
spin
Expand Down Expand Up @@ -241,6 +243,7 @@ def __init__(
stripped_type_embedding: Optional[bool] = None,
seed: Optional[Union[int, List[int]]] = None,
use_econf_tebd: bool = False,
use_tebd_bias: bool = False,
type_map: Optional[List[str]] = None,
# not implemented
spin=None,
Expand Down Expand Up @@ -293,13 +296,15 @@ def __init__(
old_impl=old_impl,
)
self.use_econf_tebd = use_econf_tebd
self.use_tebd_bias = use_tebd_bias
self.type_map = type_map
self.type_embedding = TypeEmbedNet(
ntypes,
tebd_dim,
precision=precision,
seed=child_seed(seed, 2),
use_econf_tebd=use_econf_tebd,
use_tebd_bias=use_tebd_bias,
type_map=type_map,
)
self.tebd_dim = tebd_dim
Expand Down Expand Up @@ -462,7 +467,7 @@ def serialize(self) -> dict:
data = {
"@class": "Descriptor",
"type": "dpa1",
"@version": 1,
"@version": 2,
"rcut": obj.rcut,
"rcut_smth": obj.rcut_smth,
"sel": obj.sel,
Expand All @@ -487,6 +492,7 @@ def serialize(self) -> dict:
"type_one_side": obj.type_one_side,
"concat_output_tebd": self.concat_output_tebd,
"use_econf_tebd": self.use_econf_tebd,
"use_tebd_bias": self.use_tebd_bias,
"type_map": self.type_map,
# make deterministic
"precision": RESERVED_PRECISON_DICT[obj.prec],
Expand All @@ -510,7 +516,7 @@ def serialize(self) -> dict:
@classmethod
def deserialize(cls, data: dict) -> "DescrptDPA1":
data = data.copy()
check_version_compatibility(data.pop("@version"), 1, 1)
check_version_compatibility(data.pop("@version"), 2, 1)
data.pop("@class")
data.pop("type")
variables = data.pop("@variables")
Expand All @@ -523,6 +529,9 @@ def deserialize(cls, data: dict) -> "DescrptDPA1":
embeddings_strip = data.pop("embeddings_strip")
else:
embeddings_strip = None
# compat with version 1
if "use_tebd_bias" not in data:
data["use_tebd_bias"] = True
obj = cls(**data)

def t_cvt(xx):
Expand Down
Loading

0 comments on commit 86f6e84

Please sign in to comment.