Update modeling_neollm.py
modeling_neollm.py  CHANGED  (+37 -34)
@@ -1,8 +1,8 @@
 #!/usr/bin/env python3
 """
-NeoLLM Model with FANformer Integration and
-Updated to include Fourier Analysis Network (FAN) layer for effective periodicity modeling
-
+NeoLLM Model with FANformer Integration, Dropout Regularization, and Selective Self-Attention (SSA)
+Updated to include Fourier Analysis Network (FAN) layer for effective periodicity modeling,
+dropout regularization at strategic locations
 """

 import math

@@ -45,8 +45,6 @@ else:
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

 logger = logging.get_logger(__name__)
-
-
 class FANLayer(nn.Module):
     """
     Fourier Analysis Network (FAN) layer for effective periodicity modeling.

@@ -63,26 +61,27 @@ class FANLayer(nn.Module):
         self.hidden_size = hidden_size
         self.fan_ratio = fan_ratio

-        # Calculate dimensions
-
-
+        # Calculate dimensions following the paper's approach
+        # Output will be: [cos(p) || sin(p) || g] where total = hidden_size + periodic_dim
+        output_dim = hidden_size + int(hidden_size * fan_ratio)
+        self.p_output_dim = int(output_dim * fan_ratio)
+        self.g_output_dim = output_dim - self.p_output_dim * 2

-        #
-        self.
-
+        # Single fused projection (more efficient than two separate projections)
+        self.input_linear = nn.Linear(
+            hidden_size,
+            self.p_output_dim + self.g_output_dim,
+            bias=True
+        )

         # Initialize parameters
         self._init_weights()

     def _init_weights(self):
         """Initialize weights following the paper's recommendations."""
-
-
-
-        # Initialize Wp_bar for non-periodic components
-        nn.init.normal_(self.Wp_bar.weight, mean=0.0, std=0.02)
-        if self.Wp_bar.bias is not None:
-            nn.init.zeros_(self.Wp_bar.bias)
+        nn.init.normal_(self.input_linear.weight, mean=0.0, std=0.02)
+        if self.input_linear.bias is not None:
+            nn.init.zeros_(self.input_linear.bias)

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """

@@ -93,17 +92,14 @@

         Returns:
             Transformed tensor with Fourier components concatenated
+                Shape: (batch, seq_len, hidden_size + periodic_dim)
         """
-        #
-
-
-        sin_component = torch.sin(x_periodic)
-
-        # Get non-periodic component (linear transformation)
-        x_non_periodic = self.Wp_bar(x)  # (batch, seq_len, non_periodic_dim)
+        # Single projection followed by split (more efficient)
+        pg = self.input_linear(x)
+        p, g = torch.split(pg, [self.p_output_dim, self.g_output_dim], dim=-1)

         # Concatenate all components: [cos(WpX) || sin(WpX) || (Wp¯X + Bp¯)]
-        x_fan = torch.cat([
+        x_fan = torch.cat([torch.cos(p), torch.sin(p), g], dim=-1)

         return x_fan

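Pieced together, the two FANLayer hunks reduce to the following self-contained sketch (the fan_ratio default of 0.25 is an assumed value for illustration; the real one comes from NeoLLMConfig):

```python
import torch
import torch.nn as nn

class FANLayerSketch(nn.Module):
    """The patched FANLayer boiled down: one fused projection, then
    [cos(p) || sin(p) || g] concatenation."""

    def __init__(self, hidden_size: int, fan_ratio: float = 0.25):
        super().__init__()
        output_dim = hidden_size + int(hidden_size * fan_ratio)
        self.p_output_dim = int(output_dim * fan_ratio)
        self.g_output_dim = output_dim - self.p_output_dim * 2
        self.input_linear = nn.Linear(hidden_size, self.p_output_dim + self.g_output_dim, bias=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        pg = self.input_linear(x)
        p, g = torch.split(pg, [self.p_output_dim, self.g_output_dim], dim=-1)
        # cos(p) and sin(p) each contribute p_output_dim features, g the rest,
        # so the output width is hidden_size + int(hidden_size * fan_ratio)
        return torch.cat([torch.cos(p), torch.sin(p), g], dim=-1)

layer = FANLayerSketch(512, fan_ratio=0.25)
print(layer(torch.randn(2, 16, 512)).shape)  # torch.Size([2, 16, 640])
```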
@@ -287,7 +283,7 @@ def eager_attention_forward(


 class NeoLLMAttention(nn.Module):
-    """Multi-headed attention with FANformer integration for periodicity modeling"""
+    """Multi-headed attention with FANformer integration and Selective Self-Attention for periodicity modeling"""

     def __init__(self, config: NeoLLMConfig, layer_idx: int):
         super().__init__()

@@ -338,8 +334,10 @@ class NeoLLMAttention(nn.Module):

         # Apply FANformer transformation first
         hidden_states_fan = self.fan_layer(hidden_states)
+
         hidden_shape = (*input_shape, -1, self.head_dim)

+        # Use FAN-transformed features directly for projections
         query_states, gate = torch.chunk(
             self.q_proj(hidden_states_fan).view(*input_shape, -1, self.head_dim * 2), 2, dim=-1
         )

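The doubled q_proj width is easy to miss here: half of each head's projection becomes the query, the other half a gate. A toy reproduction of just that shape logic, with made-up dimensions rather than the model's config (in the real code, fan_dim is fixed by the FANLayer output width, not a free choice):

```python
import torch
import torch.nn as nn

# Made-up dimensions for illustration
batch, seq, num_heads, head_dim, fan_dim = 2, 8, 4, 32, 640
input_shape = (batch, seq)

# q_proj emits 2 * head_dim per head: one half is the query, the other a gate
q_proj = nn.Linear(fan_dim, num_heads * head_dim * 2)
hidden_states_fan = torch.randn(*input_shape, fan_dim)

query_states, gate = torch.chunk(
    q_proj(hidden_states_fan).view(*input_shape, -1, head_dim * 2), 2, dim=-1
)
print(query_states.shape, gate.shape)  # both torch.Size([2, 8, 4, 32])
```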
@@ -537,7 +535,7 @@ def torch_recurrent_gated_delta_rule(
     return core_attn_out, last_recurrent_state

 class NeoLLMGatedDeltaNet(nn.Module):
-    """Linear attention with FANformer integration for periodicity modeling"""
+    """Linear attention with FANformer integration and Selective Self-Attention for periodicity modeling"""

     def __init__(self, config: NeoLLMConfig, layer_idx: int):
         super().__init__()

@@ -659,7 +657,8 @@ class NeoLLMGatedDeltaNet(nn.Module):

         # Apply FANformer transformation first
         hidden_states_fan = self.fan_layer(hidden_states)
-
+
+        # Use FAN-transformed features directly for projections
         projected_states_qkvz = self.in_proj_qkvz(hidden_states_fan)
         projected_states_ba = self.in_proj_ba(hidden_states_fan)
         query, key, value, z, b, a = self.fix_query_key_value_ordering(projected_states_qkvz, projected_states_ba)

@@ -737,6 +736,7 @@ class PolyNorm(torch.nn.Module):

     def forward(self, x):
         return self.weight[0] * self._norm(x**3) + self.weight[1] * self._norm(x**2) + self.weight[2] * self._norm(x) + self.bias
+
 class NeoLLMMLP(nn.Module):
     def __init__(self, config):
         super().__init__()

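PolyNorm's _norm helper sits outside this diff; assuming it is the usual RMS-style normalization, the forward above behaves like this hypothetical reconstruction (the weight and bias shapes are guesses consistent with the indexing):

```python
import torch

class PolyNormSketch(torch.nn.Module):
    """Hypothetical reconstruction: _norm is assumed to be RMS normalization,
    and the parameter shapes are inferred, not taken from the diff."""

    def __init__(self, eps: float = 1e-6):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(3) / 3)
        self.bias = torch.nn.Parameter(torch.zeros(1))
        self.eps = eps

    def _norm(self, x):
        # RMS-normalize along the feature dimension (assumption)
        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        # Weighted sum of normalized x**3, x**2, and x terms, as in the hunk above
        return (self.weight[0] * self._norm(x**3)
                + self.weight[1] * self._norm(x**2)
                + self.weight[2] * self._norm(x)
                + self.bias)
```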
@@ -817,7 +817,7 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
             **kwargs,
         )

-        #
+        # Standard residual connection
         hidden_states = residual + hidden_states

         # Apply GPAS after attention residual connection

@@ -832,7 +832,7 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):

         hidden_states = self.mlp(hidden_states)

-        #
+        # Standard residual connection
         hidden_states = residual + hidden_states

         # Apply GPAS after MLP residual connection

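Both hunks only relabel the comment on the residual add, but the ordering they document (sublayer, residual add, then GPAS) is worth seeing in one place. A toy layer with stand-ins for the real attention, MLP, and GPAS modules:

```python
import torch
import torch.nn as nn

class ToyDecoderLayer(nn.Module):
    """Order-of-operations sketch only: attn, mlp, and gpas are stand-ins,
    not the real NeoLLM modules."""

    def __init__(self, d: int):
        super().__init__()
        self.attn = nn.Linear(d, d)   # stand-in for the attention block
        self.mlp = nn.Linear(d, d)    # stand-in for NeoLLMMLP
        self.gate = nn.Parameter(torch.zeros(1))

    def gpas(self, x: torch.Tensor) -> torch.Tensor:
        # stand-in gated scaling, applied *after* each residual add
        return x * torch.sigmoid(self.gate)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.attn(hidden_states)
        hidden_states = residual + hidden_states   # standard residual connection
        hidden_states = self.gpas(hidden_states)   # GPAS after attention residual

        residual = hidden_states
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states   # standard residual connection
        hidden_states = self.gpas(hidden_states)   # GPAS after MLP residual
        return hidden_states

print(ToyDecoderLayer(8)(torch.randn(2, 4, 8)).shape)  # torch.Size([2, 4, 8])
```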
@@ -867,6 +867,8 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
     def __init__(self, config: NeoLLMConfig):
         super().__init__(config)
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+
+        # Each layer creates its own components (no shared parameters)
         self.layers = nn.ModuleList(
             [NeoLLMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )

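Worth noting in passing: the third positional argument of nn.Embedding is padding_idx, so pad embeddings start at zero and receive no gradient updates:

```python
import torch
import torch.nn as nn

# Toy numbers; the real values come from NeoLLMConfig
vocab_size, hidden_size, pad_token_id = 100, 16, 0
embed_tokens = nn.Embedding(vocab_size, hidden_size, pad_token_id)
print(embed_tokens(torch.tensor([pad_token_id])).abs().sum())  # tensor(0., ...)
```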
@@ -934,6 +936,7 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         if attention_mask is not None and torch.all(attention_mask == 1):
             linear_attn_mask = None
         return linear_attn_mask
+
 @torch.compiler.disable
 def compute_cce_loss(hidden_states, labels, lm_head_weight, lm_head_bias=None, pad_token_id=None):
     """

@@ -957,7 +960,7 @@ def compute_cce_loss(hidden_states, labels, lm_head_weight, lm_head_bias=None, p
         processed_labels,
         bias=lm_head_bias,
         shift=1,
-        impl="
+        impl="cce_kahan_full_c",
         reduction="mean"
     )

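For context, compute_cce_loss appears to wrap a fused linear-cross-entropy kernel. A hedged sketch of the call, assuming the cut-cross-entropy package's linear_cross_entropy is what is being invoked (the preprocessing that produces processed_labels is outside this diff):

```python
# Assumption: compute_cce_loss wraps the cut-cross-entropy kernel;
# the import below is that package's public entry point.
from cut_cross_entropy import linear_cross_entropy

def cce_loss_sketch(hidden_states, labels, lm_head_weight, lm_head_bias=None):
    # shift=1 performs the next-token shift inside the kernel, so position t
    # is scored against the label at t + 1; no manual slicing needed.
    return linear_cross_entropy(
        hidden_states,            # (batch, seq_len, hidden)
        lm_head_weight,           # (vocab_size, hidden) classifier matrix
        labels,                   # (batch, seq_len) int64 targets
        bias=lm_head_bias,
        shift=1,
        impl="cce_kahan_full_c",  # value taken from the hunk above
        reduction="mean",
    )
```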
@@ -1015,6 +1018,7 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
+
 # ==================== AUTOMODEL REGISTRATION ====================

 __all__ = [

@@ -1025,8 +1029,7 @@ __all__ = [
     "FANLayer",
 ]

-
 # Register the configuration and model for AutoClass support
 AutoConfig.register("neollm", NeoLLMConfig)
 AutoModel.register(NeoLLMConfig, NeoLLMModel)
-AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)
+AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)
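With the three register() calls in place, importing this module is enough for the Auto classes to resolve the custom architecture. A typical loading sketch; the checkpoint path is a placeholder:

```python
# Assumption: this file is importable as modeling_neollm; importing it runs
# the register() calls above.
import modeling_neollm  # noqa: F401
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("path/to/neollm-checkpoint")
assert config.model_type == "neollm"
model = AutoModelForCausalLM.from_pretrained("path/to/neollm-checkpoint")
```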