Upload configuration_llada.py with huggingface_hub
configuration_llada.py  CHANGED  (+7, -11)
@@ -1,13 +1,11 @@
 """
 LLaDA configuration
 """
-from transformers import AutoConfig, PretrainedConfig

-from enum import Enum
-from os import PathLike
-from typing import Union
 from dataclasses import asdict, dataclass, field
+from enum import Enum
 from glob import glob
+from os import PathLike
 from pathlib import Path
 from typing import (
     Any,
@@ -22,6 +20,7 @@ from typing import (
     cast,
 )

+from transformers import AutoConfig, PretrainedConfig

 __all__ = [
     "ActivationType",
@@ -127,7 +126,7 @@ class InitFnType(StrEnum):


 @dataclass
-class ModelConfig():
+class ModelConfig:
     """
     LLaDA (model) configuration.
     """
@@ -383,6 +382,7 @@ class ModelConfig():
                     "You can't set `multi_query_attention` and `n_kv_heads` at the same time."
                 )

+
 class ActivationCheckpointingStrategy(StrEnum):
     whole_layer = "whole_layer"
     """
@@ -403,7 +403,7 @@ class ActivationCheckpointingStrategy(StrEnum):
     """
     Checkpoint one in four transformer layers.
     """
-    
+
     two_in_three = "two_in_three"
     """
     Checkpoint two out of every three transformer layers.
@@ -439,11 +439,7 @@ class LLaDAConfig(PretrainedConfig):
         all_kwargs = model_config.__dict__
         all_kwargs.update(kwargs)
         all_kwargs.update({"use_cache": use_cache})
-        all_kwargs.update(
-            {
-                "architectures": all_kwargs.get("architectures", ["LLaDAModelLM"])
-            }
-        )
+        all_kwargs.update({"architectures": all_kwargs.get("architectures", ["LLaDAModelLM"])})
         super().__init__(**all_kwargs)

     @property
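For context on the final hunk: collapsing the multi-line all_kwargs.update(...) call into one line does not change behavior. The "architectures" key still falls back to ["LLaDAModelLM"] only when the caller has not already supplied one. A minimal standalone sketch of that fallback pattern, using plain dicts; the function name and the "MyCustomLM" value are illustrative, not part of the repository:

# Sketch of the `architectures` fallback used in LLaDAConfig.__init__;
# standalone illustration, not the repository's code.

def with_default_architectures(all_kwargs: dict) -> dict:
    # Keep an explicitly provided `architectures` entry; otherwise fall back
    # to ["LLaDAModelLM"]. The result is identical before and after the diff.
    all_kwargs.update({"architectures": all_kwargs.get("architectures", ["LLaDAModelLM"])})
    return all_kwargs

print(with_default_architectures({"use_cache": False}))
# {'use_cache': False, 'architectures': ['LLaDAModelLM']}

print(with_default_architectures({"architectures": ["MyCustomLM"]}))  # hypothetical value
# {'architectures': ['MyCustomLM']}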