from transformers import PretrainedConfig


class TinyRecursiveConfig(PretrainedConfig):
    model_type = "tiny_recursive"

    def __init__(
        self,
        vocab_size=50257,
        n_positions=1024,
        n_embd=512,
        n_head=8,
        n_physical_layers=2,
        n_loops=6,
        activation_function="gelu_new",
        resid_pdrop=0.1,
        embd_pdrop=0.1,
        attn_pdrop=0.1,
        layer_norm_epsilon=1e-5,
        scale_attn_weights=True,
        scale_attn_by_inverse_layer_idx=False,
        reorder_and_upcast_attn=False,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Standard config
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_head = n_head
        self.n_physical_layers = n_physical_layers
        self.n_loops = n_loops
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.scale_attn_weights = scale_attn_weights
        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
        self.reorder_and_upcast_attn = reorder_and_upcast_attn

        # CRITICAL FIXES FOR COMPATIBILITY
        self.max_position_embeddings = n_positions
        self.hidden_size = n_embd
        self.num_attention_heads = n_head
        self.num_hidden_layers = n_physical_layers
        self.n_inner = None
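
# ---------------------------------------------------------------------------
# Minimal usage sketch (an assumption for illustration, not part of the
# original module): instantiate the config, confirm the compatibility
# aliases set in __init__, and round-trip it through the standard
# PretrainedConfig serialization methods. The directory name
# "tiny-recursive-config" is a placeholder.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    config = TinyRecursiveConfig(n_embd=512, n_head=8, n_loops=6)

    # The aliases let generic Hugging Face utilities that expect
    # hidden_size / num_attention_heads / num_hidden_layers read this config.
    assert config.hidden_size == config.n_embd
    assert config.num_attention_heads == config.n_head
    assert config.num_hidden_layers == config.n_physical_layers

    config.save_pretrained("tiny-recursive-config")  # writes config.json
    reloaded = TinyRecursiveConfig.from_pretrained("tiny-recursive-config")
    assert reloaded.n_loops == 6
    assert reloaded.model_type == "tiny_recursive"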