NeoLLM

File size: 48,834 Bytes

#!/usr/bin/env python3
"""
NeoLLM Model with FANformer Integration in both Attention and FFN, Dropout Regularization, 
SeeDNorm (Self-Rescaled Dynamic Normalization), and ResFormer Value Residual Learning 
for enhanced information flow through deep layers.

Updated to include:
- Fourier Analysis Network (FAN) layer for effective periodicity modeling in attention (relational space)
- FAN layer in FFN for featural periodicity modeling (complementary coverage)
- SeeDNorm: Dynamic normalization with input-dependent scaling for better adaptability
- Dropout regularization at strategic locations
- ResFormer: Feature residual connections from first layer (applied before projections)
"""

import math
from typing import Any, Callable, Optional, Union

import torch
import torch.nn.functional as F
from torch import nn
from cut_cross_entropy import linear_cross_entropy

from transformers.activations import ACT2FN
from transformers.generation import GenerationMixin
from transformers.masking_utils import create_causal_mask
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_layers import GradientCheckpointingLayer
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.processing_utils import Unpack
from transformers.utils import TransformersKwargs, logging
from transformers.utils.generic import check_model_inputs
from transformers.utils.import_utils import (
    is_causal_conv1d_available,
    is_flash_linear_attention_available,
)
from .configuration_neollm import NeoLLMConfig


if is_causal_conv1d_available():
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
    causal_conv1d_update, causal_conv1d_fn = None, None

if is_flash_linear_attention_available():
    from fla.modules import FusedRMSNormGated
    from fla.ops.gated_delta_rule import chunk_gated_delta_rule, fused_recurrent_gated_delta_rule
else:
    chunk_gated_delta_rule, fused_recurrent_gated_delta_rule = None, None
    FusedRMSNormGated = None
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

logger = logging.get_logger(__name__)


class FANLayer(nn.Module):
    """
    Fourier Analysis Network (FAN) layer.

    Implements the activation-free variant FANLayer' from "FANformer: Improving Large
    Language Models Through Effective Periodicity Modeling":

        FANLayer'(X) = [cos(WpX) || sin(WpX) || (Wp¯X + Bp¯)]

    A single linear projection produces both the periodic pre-activations ``p`` and the
    plain linear features ``g``; the output concatenates ``cos(p)``, ``sin(p)`` and ``g``.

    Args:
        hidden_size: Size of the last dimension of the input.
        fan_ratio: Controls both how much the layer widens the features and which share
            of the widened output is periodic.
    """

    def __init__(self, hidden_size: int, fan_ratio: float = 0.25):
        super().__init__()
        self.hidden_size = hidden_size
        self.fan_ratio = fan_ratio

        # Total output width is hidden_size plus a fan_ratio-sized extension.
        total_width = hidden_size + int(hidden_size * fan_ratio)
        periodic_width = int(total_width * fan_ratio)

        # cos and sin each take `periodic_width` channels; the rest is linear.
        self.p_output_dim = periodic_width
        self.g_output_dim = total_width - 2 * periodic_width

        # One fused projection yields p and g together; it is split in forward().
        fused_width = self.p_output_dim + self.g_output_dim
        self.input_linear = nn.Linear(hidden_size, fused_width, bias=True)

        self._init_weights()

    def _init_weights(self):
        """Draw the projection weights from N(0, 0.02^2) and zero the bias."""
        projection = self.input_linear
        nn.init.normal_(projection.weight, mean=0.0, std=0.02)
        if projection.bias is not None:
            nn.init.zeros_(projection.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Project ``x`` and concatenate its periodic and linear features.

        Args:
            x: Tensor whose last dimension is ``hidden_size``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of
            ``2 * p_output_dim + g_output_dim``.
        """
        projected = self.input_linear(x)
        periodic = projected[..., : self.p_output_dim]
        linear_part = projected[..., self.p_output_dim :]
        return torch.cat((periodic.cos(), periodic.sin(), linear_part), dim=-1)


class LNS(nn.Module):
    """
    LayerNorm Scaling (LNS): multiply activations by 1/√ℓ.

    From "The Curse of Depth in Large Language Models":
    h^(ℓ) = LayerNorm(h^(ℓ)) × (1/√ℓ)

    This module performs only the scaling step; ℓ is the 1-based layer number derived
    from the 0-based ``layer_idx`` passed to the constructor.
    """
    def __init__(self, layer_idx: int):
        super().__init__()
        # Convert the 0-based index to a 1-based depth and clamp it to at least 1
        # so the square root below never sees zero or a negative number.
        depth = layer_idx + 1
        self.layer_idx = depth if depth >= 1 else 1
        self.scale = 1.0 / math.sqrt(self.layer_idx)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        scaled = x * self.scale
        return scaled


class GPAS(nn.Module):
    """
    Gradient-Preserving Activation Scaling (GPAS).

    Shrinks the forward activation by ``silu(alpha) * x`` while routing that correction
    through a detached copy of ``x``, so the gradient with respect to ``x`` is left
    unscaled. ``alpha`` is a single learnable scalar initialised to zero, which makes
    the module an identity at initialisation.
    """
    def __init__(self, d_model: int):
        super().__init__()

        self.d_model = d_model
        self.alpha = nn.Parameter(torch.zeros(1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # The subtracted term sees x only through detach(), so d(out)/dx stays 1.
        gate = F.silu(self.alpha)
        return x - gate * x.detach()


class SeeDNorm(nn.Module):
    """
    Self-Rescaled Dynamic Normalization (SeeDNorm).

    From "SeeDNorm: Self-Rescaled Dynamic Normalization":
    SeeDNorm(x) = [σ(x·β^T)·α + γ] ⊙ x/RMS(x)

    The per-channel gain applied after RMS normalization is the sum of a static part
    (γ) and an input-dependent part: a per-token scalar ``tanh(x·β)`` times α.

    Parameters:
    - γ (``gamma``): static gain, initialised to 1
    - β (``beta``): projection used for the per-token scalar, initialised to 0
    - α (``alpha``): per-channel weight of the dynamic term, initialised to 1

    With the initial β = 0 the dynamic term vanishes and the layer reduces to an
    RMS norm with gain γ.

    Args:
        dim: Size of the normalized (last) dimension
        eps: Constant added to the mean square for numerical stability
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.dim = dim
        self.eps = eps

        # Registration order (gamma, beta, alpha) is kept stable for parameter iteration.
        self.gamma = nn.Parameter(torch.ones(dim))
        self.beta = nn.Parameter(torch.zeros(dim))
        self.alpha = nn.Parameter(torch.ones(dim))

    def _rms_norm(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` divided by its root mean square over the last dimension."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Normalize ``x`` and apply the static-plus-dynamic gain.

        Args:
            x: Input tensor of shape (..., dim)

        Returns:
            Tensor of the same shape and dtype as ``x``
        """
        # One scalar per token: the dot product of the token with beta, squashed by tanh.
        token_projection = (x * self.beta).sum(dim=-1, keepdim=True)
        gain = torch.tanh(token_projection) * self.alpha + self.gamma

        # Normalization and scaling run in float32; the result is cast back at the end.
        normalized = self._rms_norm(x.float())
        return (normalized * gain.float()).type_as(x)

    def extra_repr(self) -> str:
        return f"dim={self.dim}, eps={self.eps}"


class NeoLLMRMSNormGated(nn.Module):
    """
    RMSNorm followed by an optional SiLU gate.

    The input is RMS-normalized over its last dimension in float32, scaled by the
    learnable ``weight``, and, when a ``gate`` tensor is supplied, multiplied by
    ``silu(gate)``. The result is returned in the dtype of the input.

    Args:
        hidden_size: Size of the normalized (last) dimension.
        eps: Constant added to the mean square for numerical stability.
        **kwargs: Accepted and ignored, so the constructor can be called with the
            extra keyword arguments passed to the fused implementation.
    """
    def __init__(self, hidden_size, eps=1e-6, **kwargs):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states, gate=None):
        """
        Args:
            hidden_states: Tensor whose last dimension is ``hidden_size``.
            gate: Optional tensor broadcastable to ``hidden_states``. When ``None``
                the gating step is skipped and the plain weighted RMS norm is returned.

        Returns:
            Tensor with the shape and dtype of ``hidden_states``.
        """
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        # Norm before gate
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        hidden_states = self.weight * hidden_states.to(input_dtype)
        # `gate` defaults to None in the signature; guard it instead of crashing on
        # `None.to(...)` so the default is actually usable.
        if gate is not None:
            hidden_states = hidden_states * F.silu(gate.to(torch.float32))

        return hidden_states.to(input_dtype)


class NeoLLMRotaryEmbedding(nn.Module):
    """
    Produces the rotary position embedding tables (cos, sin) for given position ids.

    The inverse frequencies and the attention scaling factor come from the RoPE
    initialisation function selected by ``config.rope_scaling`` (or ``"default"``
    when no scaling dict is configured).
    """
    inv_freq: torch.Tensor  # fix linting for `register_buffer`

    def __init__(self, config: NeoLLMConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        rope_scaling = getattr(config, "rope_scaling", None)
        if isinstance(rope_scaling, dict):
            self.rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
        else:
            self.rope_type = "default"

        max_positions = config.max_position_embeddings
        self.max_seq_len_cached = max_positions
        self.original_max_seq_len = max_positions

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        # Non-persistent: the buffer is recomputed from the config, not stored in checkpoints.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        """
        Args:
            x: Tensor used only for its device and dtype.
            position_ids: Integer positions; indexed here as (batch, seq_len).

        Returns:
            ``(cos, sin)`` tensors in the dtype of ``x``.
        """
        batch = position_ids.shape[0]
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(batch, -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        # autocast is keyed by device type; "mps" (or a non-string type) falls back to "cpu".
        device_type = "cpu"
        if isinstance(x.device.type, str) and x.device.type != "mps":
            device_type = x.device.type

        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            # The frequency table is duplicated so it spans both halves used by rotate_half.
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def rotate_half(x):
    """Swap the two halves of the last dimension, negating the half that moves to the front."""
    half = x.shape[-1] // 2
    front, back = x[..., :half], x[..., half:]
    return torch.cat((back.neg(), front), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """
    Apply rotary position embeddings to the query and key tensors.

    Only the first ``cos.shape[-1]`` channels of ``q`` and ``k`` are rotated; any
    remaining channels are passed through unchanged and appended after the rotated part.

    Args:
        q: Query tensor.
        k: Key tensor.
        cos: Cosine table; a singleton dimension is inserted at ``unsqueeze_dim``.
        sin: Sine table; treated like ``cos``.
        position_ids: Unused by this function.
        unsqueeze_dim: Where to insert the broadcast dimension into ``cos`` and ``sin``.

    Returns:
        Tuple ``(q_embed, k_embed)`` with the same shapes as ``q`` and ``k``.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    rotary_dim = cos.shape[-1]

    def _rotate_leading_channels(states):
        # Split into the rotated slice and the pass-through remainder.
        to_rotate = states[..., :rotary_dim]
        untouched = states[..., rotary_dim:]
        rotated = (to_rotate * cos) + (rotate_half(to_rotate) * sin)
        return torch.cat([rotated, untouched], dim=-1)

    return _rotate_leading_channels(q), _rotate_leading_channels(k)


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat every key/value head ``n_rep`` times along the head dimension.

    Equivalent to ``torch.repeat_interleave(x, dim=1, repeats=n_rep)``: the input goes from
    (batch, num_key_value_heads, seqlen, head_dim) to
    (batch, num_key_value_heads * n_rep, seqlen, head_dim). With ``n_rep == 1`` the input
    tensor itself is returned.
    """
    batch, kv_heads, slen, head_dim = hidden_states.shape
    if n_rep != 1:
        # Insert a repeat axis next to the head axis, broadcast it, then fold it into the heads.
        expanded = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, slen, head_dim)
        hidden_states = expanded.reshape(batch, kv_heads * n_rep, slen, head_dim)
    return hidden_states


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    """
    Reference (non-fused) scaled dot-product attention.

    Args:
        module: Provides ``num_key_value_groups`` (how often each KV head is repeated)
            and ``training`` (whether dropout is active).
        query: Query states, heads on dim 1.
        key: Key states with the KV-head count on dim 1.
        value: Value states with the KV-head count on dim 1.
        attention_mask: Optional additive mask; its last dimension is cropped to the key length.
        scaling: Factor applied to the raw attention scores.
        dropout: Dropout probability applied to the attention weights.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        ``(attn_output, attn_weights)`` where ``attn_output`` has the head and sequence
        dimensions swapped back and is contiguous.
    """
    n_groups = module.num_key_value_groups
    key_states = repeat_kv(key, n_groups)
    value_states = repeat_kv(value, n_groups)

    scores = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        scores = scores + attention_mask[:, :, :, : key_states.shape[-2]]

    # Softmax runs in float32 and is cast back to the query dtype afterwards.
    probs = F.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    probs = F.dropout(probs, p=dropout, training=module.training)

    context = torch.matmul(probs, value_states).transpose(1, 2).contiguous()
    return context, probs


class NeoLLMAttention(nn.Module):
    """
    Gated multi-head attention operating on FAN-transformed features.

    Pipeline: FAN layer -> optional feature residual from the first layer -> Q/K/V
    projections -> SeeDNorm on Q and K -> rotary embedding -> attention -> sigmoid
    output gate -> output projection -> dropout.

    ResFormer-style feature residual, applied BEFORE the QKV projections:
    H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n
    """

    def __init__(self, config: NeoLLMConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        # FAN layer in front of the projections; its output is wider than hidden_size.
        fan_ratio = getattr(config, 'fan_ratio', 0.125)
        self.fan_layer = FANLayer(hidden_size=config.hidden_size, fan_ratio=fan_ratio)
        fan_output_dim = config.hidden_size + int(config.hidden_size * fan_ratio)

        q_width = config.num_attention_heads * self.head_dim
        kv_width = config.num_key_value_heads * self.head_dim
        use_bias = config.attention_bias

        # q_proj is twice as wide as the query: the second half is the output gate.
        self.q_proj = nn.Linear(fan_output_dim, q_width * 2, bias=use_bias)
        self.k_proj = nn.Linear(fan_output_dim, kv_width, bias=use_bias)
        self.v_proj = nn.Linear(fan_output_dim, kv_width, bias=use_bias)
        self.o_proj = nn.Linear(q_width, config.hidden_size, bias=use_bias)

        # Per-head normalization of queries and keys.
        self.q_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)

        # Dropout applied to the projected attention output.
        self.dropout = nn.Dropout(config.dropout_rate)

        # Mixing weights of the feature residual, both initialised to 0.5.
        self.lambda_1 = nn.Parameter(torch.tensor(0.5))  # weight of the first layer's FAN features
        self.lambda_2 = nn.Parameter(torch.tensor(0.5))  # weight of this layer's FAN features

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        first_layer_fan: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
        """
        Args:
            hidden_states: Input whose last dimension is ``config.hidden_size``.
            position_embeddings: ``(cos, sin)`` tables for the rotary embedding.
            attention_mask: Mask forwarded to the attention implementation.
            first_layer_fan: FAN features of the first layer; when given they are mixed
                with this layer's FAN features before the projections.
            **kwargs: Forwarded to the attention implementation.

        Returns:
            ``(attn_output, attn_weights, current_layer_fan)`` where ``current_layer_fan``
            is a copy of the (possibly mixed) FAN features used for the projections.
        """
        input_shape = hidden_states.shape[:-1]

        hidden_states_fan = self.fan_layer(hidden_states)
        if first_layer_fan is not None:
            # Mixing happens in FAN space so every layer type sees the same width.
            hidden_states_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * hidden_states_fan

        # Copy handed back to the caller, e.g. to serve as first_layer_fan for later layers.
        current_layer_fan = hidden_states_fan.clone()

        hidden_shape = (*input_shape, -1, self.head_dim)

        # Each head's projection holds [query | gate] side by side.
        q_and_gate = self.q_proj(hidden_states_fan).view(*input_shape, -1, self.head_dim * 2)
        query_states, gate = torch.chunk(q_and_gate, 2, dim=-1)
        gate = gate.reshape(*input_shape, -1)

        query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states_fan).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states_fan).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        attention_interface: Callable
        if self.config._attn_implementation == "eager":
            attention_interface = eager_attention_forward
        else:
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        # Attention dropout is only active in training mode.
        attn_dropout = self.attention_dropout if self.training else 0.0
        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=attn_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = attn_output * torch.sigmoid(gate)

        attn_output = self.dropout(self.o_proj(attn_output))

        return attn_output, attn_weights, current_layer_fan


def apply_mask_to_padding_states(hidden_states, attention_mask):
    """
    Zero out the hidden states of padding tokens.

    The mask is applied only when it is given and both of its first two dimensions are
    larger than 1; otherwise ``hidden_states`` is returned untouched. The result keeps
    the dtype of ``hidden_states``.
    """
    if attention_mask is None:
        return hidden_states
    if attention_mask.shape[1] <= 1 or attention_mask.shape[0] <= 1:
        return hidden_states

    original_dtype = hidden_states.dtype
    return (hidden_states * attention_mask[:, :, None]).to(original_dtype)


# True only when every optional kernel was imported; each name is None when its
# library is missing (see the guarded imports at the top of the file).
is_fast_path_available = all(
    bool(kernel)
    for kernel in (causal_conv1d_fn, causal_conv1d_update, chunk_gated_delta_rule, fused_recurrent_gated_delta_rule)
)


def torch_causal_conv1d_update(
    hidden_states,
    conv_state,
    weight,
    bias=None,
    activation=None,
):
    """
    Pure-PyTorch incremental depthwise causal convolution.

    Appends ``hidden_states`` to the cached ``conv_state`` along the last dimension,
    refreshes ``conv_state`` in place with the most recent entries, runs a depthwise
    ``conv1d`` over the joined window and returns the SiLU-activated outputs for the
    newly added positions.

    Args:
        hidden_states: New inputs of shape (batch, channels, new_steps).
        conv_state: Cached inputs whose last dimension is the history length; updated in place.
        weight: Depthwise kernel of shape (channels, kernel_size).
        bias: Optional convolution bias.
        activation: Not consulted; SiLU is always applied.

    Returns:
        Tensor of shape (batch, channels, new_steps) in the dtype of ``hidden_states``.
    """
    _batch, channels, new_steps = hidden_states.shape
    history = conv_state.shape[-1]

    window = torch.cat([conv_state, hidden_states], dim=-1).to(weight.dtype)
    # Keep the trailing `history` entries as the cache for the next call.
    conv_state.copy_(window[:, :, -history:])

    conv_out = F.conv1d(window, weight.unsqueeze(1), bias, padding=0, groups=channels)
    return F.silu(conv_out[:, :, -new_steps:]).to(hidden_states.dtype)


def l2norm(x: torch.FloatTensor, dim: int = -1, eps: float = 1e-6):
    """Scale ``x`` to unit L2 norm along ``dim``; written to align with the l2norm in the FLA library.

    ``eps`` is added to the squared norm before the square root, so an all-zero
    input yields zeros rather than NaN.
    """
    squared_norm = (x * x).sum(dim=dim, keepdim=True)
    return x * (1 / torch.sqrt(squared_norm + eps))


def torch_chunk_gated_delta_rule(
    query,
    key,
    value,
    g,
    beta,
    chunk_size=64,
    initial_state=None,
    output_final_state=False,
    use_qk_l2norm_in_kernel=False,
):
    """
    Pure-PyTorch chunked evaluation of the gated delta rule.

    Inputs are laid out time-major per head: ``query``/``key``/``value`` are
    (batch, seq_len, heads, head_dim) and ``g``/``beta`` are (batch, seq_len, heads).
    Internally everything is moved to (batch, heads, seq_len, ...) in float32, the time
    axis is zero-padded to a multiple of ``chunk_size``, and the recurrence is advanced
    one chunk at a time.

    Args:
        query, key, value: Attention inputs as described above.
        g: Per-step log decay (exponentiated inside this function).
        beta: Per-step write strength multiplied into keys and values.
        chunk_size: Number of time steps processed per chunk.
        initial_state: Optional starting state of shape (batch, heads, k_head_dim, v_head_dim).
        output_final_state: Whether to return the state after the last chunk.
        use_qk_l2norm_in_kernel: L2-normalize ``query`` and ``key`` over the last dim first.

    Returns:
        ``(output, final_state)``. ``output`` is (batch, seq_len, heads, v_head_dim) in the
        dtype of ``query``; ``final_state`` is ``None`` unless ``output_final_state`` is set.
    """
    out_dtype = query.dtype
    if use_qk_l2norm_in_kernel:
        query = l2norm(query, dim=-1, eps=1e-6)
        key = l2norm(key, dim=-1, eps=1e-6)

    def _heads_first(t):
        # (batch, time, heads, ...) -> (batch, heads, time, ...), computed in float32
        return t.transpose(1, 2).contiguous().to(torch.float32)

    query, key, value = _heads_first(query), _heads_first(key), _heads_first(value)
    beta, g = _heads_first(beta), _heads_first(g)

    batch_size, n_heads, seq_len, k_head_dim = key.shape
    v_head_dim = value.shape[-1]

    # Zero-pad the time axis up to the next multiple of chunk_size.
    n_pad = (chunk_size - seq_len % chunk_size) % chunk_size
    query = F.pad(query, (0, 0, 0, n_pad))
    key = F.pad(key, (0, 0, 0, n_pad))
    value = F.pad(value, (0, 0, 0, n_pad))
    beta = F.pad(beta, (0, n_pad))
    g = F.pad(g, (0, n_pad))
    n_chunks = (seq_len + n_pad) // chunk_size

    query = query * (1 / (query.shape[-1] ** 0.5))

    beta_col = beta.unsqueeze(-1)
    v_beta = value * beta_col
    k_beta = key * beta_col

    def _chunked(t):
        # (batch, heads, time, dim) -> (batch, heads, n_chunks, chunk_size, dim)
        return t.reshape(t.shape[0], t.shape[1], -1, chunk_size, t.shape[-1])

    query, key, value = _chunked(query), _chunked(key), _chunked(value)
    k_beta, v_beta = _chunked(k_beta), _chunked(v_beta)
    g = g.reshape(g.shape[0], g.shape[1], -1, chunk_size)

    all_true = torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device)
    upper_with_diag = torch.triu(all_true, diagonal=0)
    upper_strict = torch.triu(all_true, diagonal=1)

    # Cumulative log decay inside each chunk and the pairwise decay between positions.
    g = g.cumsum(dim=-1)
    decay_mask = ((g.unsqueeze(-1) - g.unsqueeze(-2)).tril().exp().float()).tril()

    # Strictly lower-triangular interaction matrix; the row-by-row update below
    # resolves the dependency of each position on the earlier ones in its chunk.
    attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(upper_with_diag, 0)
    for i in range(1, chunk_size):
        row = attn[..., i, :i].clone()
        sub = attn[..., :i, :i].clone()
        attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
    attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
    value = attn @ v_beta
    k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))

    if initial_state is None:
        state = torch.zeros(batch_size, n_heads, k_head_dim, v_head_dim).to(value)
    else:
        state = initial_state.to(value)
    core_attn_out = torch.zeros_like(value)

    # Advance the recurrent state chunk by chunk.
    for c in range(n_chunks):
        q_c, k_c, v_c = query[:, :, c], key[:, :, c], value[:, :, c]
        g_c = g[:, :, c]
        intra = (q_c @ k_c.transpose(-1, -2) * decay_mask[:, :, c]).masked_fill_(upper_strict, 0)
        v_new = v_c - k_cumdecay[:, :, c] @ state
        inter = (q_c * g_c[..., None].exp()) @ state
        core_attn_out[:, :, c] = inter + intra @ v_new
        g_last = g_c[:, :, -1]
        state = (
            state * g_last[..., None, None].exp()
            + (k_c * (g_last[..., None] - g_c).exp()[..., None]).transpose(-1, -2) @ v_new
        )

    if not output_final_state:
        state = None

    # Merge the chunk axis back into time, drop the padding, restore the input layout.
    core_attn_out = core_attn_out.reshape(core_attn_out.shape[0], core_attn_out.shape[1], -1, core_attn_out.shape[-1])
    core_attn_out = core_attn_out[:, :, :seq_len]
    core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(out_dtype)
    return core_attn_out, state


def torch_recurrent_gated_delta_rule(
    query, key, value, g, beta, initial_state, output_final_state, use_qk_l2norm_in_kernel=False
):
    """
    Pure-PyTorch step-by-step evaluation of the gated delta rule.

    Inputs are laid out time-major per head: ``query``/``key``/``value`` are
    (batch, seq_len, heads, head_dim) and ``g``/``beta`` are (batch, seq_len, heads).
    At every step the state is decayed by ``exp(g)``, the value currently stored under
    the key is read out, and the ``beta``-scaled difference to the new value is written back.

    Args:
        query, key, value: Attention inputs as described above.
        g: Per-step log decay.
        beta: Per-step write strength.
        initial_state: Starting state of shape (batch, heads, k_head_dim, v_head_dim), or
            ``None`` to start from zeros.
        output_final_state: Whether to return the state after the last step.
        use_qk_l2norm_in_kernel: L2-normalize ``query`` and ``key`` over the last dim first.

    Returns:
        ``(output, final_state)``. ``output`` is (batch, seq_len, heads, v_head_dim) in the
        dtype of ``query``; ``final_state`` is ``None`` unless ``output_final_state`` is set.
    """
    out_dtype = query.dtype
    if use_qk_l2norm_in_kernel:
        query = l2norm(query, dim=-1, eps=1e-6)
        key = l2norm(key, dim=-1, eps=1e-6)

    # (batch, time, heads, ...) -> (batch, heads, time, ...), computed in float32
    query, key, value, beta, g = (
        t.transpose(1, 2).contiguous().to(torch.float32) for t in (query, key, value, beta, g)
    )

    batch_size, n_heads, seq_len, k_head_dim = key.shape
    v_head_dim = value.shape[-1]
    query = query * (1 / (query.shape[-1] ** 0.5))

    core_attn_out = torch.zeros(batch_size, n_heads, seq_len, v_head_dim).to(value)
    if initial_state is None:
        state = torch.zeros(batch_size, n_heads, k_head_dim, v_head_dim).to(value)
    else:
        state = initial_state.to(value)

    for step in range(seq_len):
        q_t, k_t, v_t = query[:, :, step], key[:, :, step], value[:, :, step]
        decay = g[:, :, step].exp().unsqueeze(-1).unsqueeze(-1)
        write_strength = beta[:, :, step].unsqueeze(-1)

        state = state * decay
        # What the decayed state currently returns for this key.
        kv_mem = (state * k_t.unsqueeze(-1)).sum(dim=-2)
        delta = (v_t - kv_mem) * write_strength
        state = state + k_t.unsqueeze(-1) * delta.unsqueeze(-2)
        core_attn_out[:, :, step] = (state * q_t.unsqueeze(-1)).sum(dim=-2)

    if not output_final_state:
        state = None
    core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(out_dtype)
    return core_attn_out, state


class NeoLLMGatedDeltaNet(nn.Module):
    """
    Linear attention (gated delta rule) with FANformer integration, a gated RMS norm on
    the output, and ResFormer feature residual connections.

    ResFormer enhancement: Applies learnable feature residual connections from the first layer
    BEFORE QKV projections: H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n

    The forward pass in this class keeps no cache: it always calls the chunked kernel
    with ``initial_state=None`` and discards the final state.
    """
    
    def __init__(self, config: NeoLLMConfig, layer_idx: int):
        """
        Args:
            config: Model configuration; the ``linear_*`` fields size the key/value heads
                and the convolution kernel, ``fan_ratio`` (default 0.125) sizes the FAN layer.
            layer_idx: Index of this layer; only stored on the module here.
        """
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_v_heads = config.linear_num_value_heads
        self.num_k_heads = config.linear_num_key_heads
        self.head_k_dim = config.linear_key_head_dim
        self.head_v_dim = config.linear_value_head_dim
        # Total channel counts across all key heads / all value heads
        self.key_dim = self.head_k_dim * self.num_k_heads
        self.value_dim = self.head_v_dim * self.num_v_heads

        self.conv_kernel_size = config.linear_conv_kernel_dim
        self.layer_idx = layer_idx
        self.activation = config.hidden_act
        self.act = ACT2FN[config.hidden_act]
        self.layer_norm_epsilon = config.rms_norm_eps

        # FANformer integration: FAN layer before projections
        self.fan_layer = FANLayer(
            hidden_size=config.hidden_size, 
            fan_ratio=getattr(config, 'fan_ratio', 0.125)
        )
        
        # Calculate the output dimension after FAN transformation
        # (must stay in sync with the width FANLayer produces for the same ratio)
        fan_output_dim = config.hidden_size + int(config.hidden_size * getattr(config, 'fan_ratio', 0.125))

        # Depthwise convolution over the concatenated [query | key | value] channels
        self.conv_dim = self.key_dim * 2 + self.value_dim
        self.conv1d = nn.Conv1d(
            in_channels=self.conv_dim,
            out_channels=self.conv_dim,
            bias=False,
            kernel_size=self.conv_kernel_size,
            groups=self.conv_dim,
            # Pads both ends; forward() crops the output back to seq_len, which keeps it causal
            padding=self.conv_kernel_size - 1,
        )

        # projection of the FAN-transformed hidden states
        # qkvz: query, key, value and the output gate z; ba: two scalars per value head
        projection_size_qkvz = self.key_dim * 2 + self.value_dim * 2
        projection_size_ba = self.num_v_heads * 2
        self.in_proj_qkvz = nn.Linear(fan_output_dim, projection_size_qkvz, bias=False)
        self.in_proj_ba = nn.Linear(fan_output_dim, projection_size_ba, bias=False)

        # time step projection (discretization)
        self.dt_bias = nn.Parameter(torch.ones(self.num_v_heads))

        # Stored as a log so that forward() can use -exp(A_log) as a strictly negative rate
        A = torch.empty(self.num_v_heads).uniform_(0, 16)
        self.A_log = nn.Parameter(torch.log(A))

        # FLA compatibility: use "silu" for FusedRMSNormGated, original activation elsewhere
        fla_compatible_activation = "silu" if self.activation not in ['swish', 'silu', 'sigmoid'] else self.activation
        
        # Use the fused gated norm when the FLA import succeeded, otherwise the local fallback.
        # NOTE(review): the fused branch calls torch.cuda.current_device() unconditionally,
        # so it presumably requires a CUDA device whenever FLA is installed - confirm.
        self.norm = (
            NeoLLMRMSNormGated(self.head_v_dim, eps=self.layer_norm_epsilon)
            if FusedRMSNormGated is None
            else FusedRMSNormGated(
                self.head_v_dim,
                eps=self.layer_norm_epsilon,
                activation=fla_compatible_activation,
                device=torch.cuda.current_device(),
                dtype=config.dtype if config.dtype is not None else torch.get_default_dtype(),
            )
        )

        self.out_proj = nn.Linear(self.value_dim, self.hidden_size, bias=False)
        
        # Dropout for attention output
        self.dropout = nn.Dropout(config.dropout_rate)

        # Kernel selection: each optional import is None when its library is missing,
        # in which case the pure-PyTorch implementation from this file is used instead.
        # causal_conv1d_fn has no fallback function; forward() uses self.conv1d when it is None.
        self.causal_conv1d_fn = causal_conv1d_fn
        self.causal_conv1d_update = causal_conv1d_update or torch_causal_conv1d_update
        self.chunk_gated_delta_rule = chunk_gated_delta_rule or torch_chunk_gated_delta_rule
        self.recurrent_gated_delta_rule = fused_recurrent_gated_delta_rule or torch_recurrent_gated_delta_rule

        # ResFormer: learnable feature residual parameters (initialized to 0.5)
        self.lambda_1 = nn.Parameter(torch.tensor(0.5))  # Weight for H_fan_1 (first layer features)
        self.lambda_2 = nn.Parameter(torch.tensor(0.5))  # Weight for H_fan_n (current layer features)

        if not is_fast_path_available:
            logger.warning_once(
                "The fast path is not available because one of the required library is not installed. Falling back to "
                "torch implementation. To install follow https://github.com/fla-org/flash-linear-attention#installation and"
                " https://github.com/Dao-AILab/causal-conv1d"
            )

    def fix_query_key_value_ordering(self, mixed_qkvz, mixed_ba):
        """
        Derives `query`, `key` and `value` tensors from `mixed_qkvz` and `mixed_ba`.

        Both inputs are first viewed as one group per key head. Within a group the
        channels are ordered [query | key | value | z] (respectively [b | a]), where the
        value/z/b/a slices hold the ``num_v_heads // num_k_heads`` value heads that
        belong to that key head.

        Returns:
            ``(query, key, value, z, b, a)``. ``query``/``key`` keep the per-key-head
            layout (..., num_k_heads, head_k_dim); ``value``/``z`` are reshaped to
            (batch, seq, -1, head_v_dim); ``b``/``a`` to (batch, seq, num_v_heads).
        """
        new_tensor_shape_qkvz = mixed_qkvz.size()[:-1] + (
            self.num_k_heads,
            2 * self.head_k_dim + 2 * self.head_v_dim * self.num_v_heads // self.num_k_heads,
        )
        new_tensor_shape_ba = mixed_ba.size()[:-1] + (self.num_k_heads, 2 * self.num_v_heads // self.num_k_heads)

        mixed_qkvz = mixed_qkvz.view(*new_tensor_shape_qkvz)
        mixed_ba = mixed_ba.view(*new_tensor_shape_ba)
        split_arg_list_qkvz = [
            self.head_k_dim,
            self.head_k_dim,
            (self.num_v_heads // self.num_k_heads * self.head_v_dim),
            (self.num_v_heads // self.num_k_heads * self.head_v_dim),
        ]
        split_arg_list_ba = [self.num_v_heads // self.num_k_heads, self.num_v_heads // self.num_k_heads]
        # dim=3 is the per-group channel axis created by the views above
        query, key, value, z = torch.split(mixed_qkvz, split_arg_list_qkvz, dim=3)
        b, a = torch.split(mixed_ba, split_arg_list_ba, dim=3)
        # [b, sq, ng, np/ng * hn] -> [b, sq, np, hn]
        value = value.reshape(value.size(0), value.size(1), -1, self.head_v_dim)
        z = z.reshape(z.size(0), z.size(1), -1, self.head_v_dim)
        b = b.reshape(b.size(0), b.size(1), self.num_v_heads)
        a = a.reshape(a.size(0), a.size(1), self.num_v_heads)
        return query, key, value, z, b, a

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        first_layer_fan: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            hidden_states: Input of shape (batch, seq_len, hidden_size).
            attention_mask: Optional padding mask; used only to zero padded positions
                of ``hidden_states`` before any other computation.
            first_layer_fan: FAN features of the first layer; when given they are mixed
                with this layer's FAN features before the projections.

        Returns:
            ``(output, current_layer_fan)``: the projected layer output and a copy of the
            (possibly mixed) FAN features that were fed to the projections.
        """
        hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)

        # Set up dimensions for reshapes later
        batch_size, seq_len, _ = hidden_states.shape

        # Apply FANformer transformation first
        hidden_states_fan = self.fan_layer(hidden_states)
        
        # ResFormer: Apply feature residual connection BEFORE projections
        # This ensures dimensional compatibility across all layer types
        if first_layer_fan is not None:
            hidden_states_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * hidden_states_fan
        
        # Store current FAN features for potential use as first_layer_fan in subsequent layers
        current_layer_fan = hidden_states_fan.clone()
        
        # Use FAN-transformed features (with residual applied) for projections
        projected_states_qkvz = self.in_proj_qkvz(hidden_states_fan)
        projected_states_ba = self.in_proj_ba(hidden_states_fan)
        query, key, value, z, b, a = self.fix_query_key_value_ordering(projected_states_qkvz, projected_states_ba)
        # Flatten the head axes so q, k, v can be concatenated channel-wise for the convolution
        query, key, value = (x.reshape(x.shape[0], x.shape[1], -1) for x in (query, key, value))

        mixed_qkv = torch.cat((query, key, value), dim=-1)
        # (batch, seq_len, channels) -> (batch, channels, seq_len) as expected by conv1d
        mixed_qkv = mixed_qkv.transpose(1, 2)

        # Simple convolution without cache
        if self.causal_conv1d_fn is not None:
            mixed_qkv = self.causal_conv1d_fn(
                x=mixed_qkv,
                weight=self.conv1d.weight.squeeze(1),
                bias=self.conv1d.bias,
                activation="silu",  # Keep original activation for conv1d
                seq_idx=None,
            )
        else:
            # The module pads both sides; keeping the first seq_len outputs makes it causal
            mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len])

        mixed_qkv = mixed_qkv.transpose(1, 2)
        query, key, value = torch.split(
            mixed_qkv,
            [
                self.key_dim,
                self.key_dim,
                self.value_dim,
            ],
            dim=-1,
        )
        # Restore the per-head layout: (batch, seq_len, heads, head_dim)
        query = query.reshape(query.shape[0], query.shape[1], -1, self.head_k_dim)
        key = key.reshape(key.shape[0], key.shape[1], -1, self.head_k_dim)
        value = value.reshape(value.shape[0], value.shape[1], -1, self.head_v_dim)

        # beta: per-head write strength in (0, 1)
        beta = b.sigmoid()
        # If the model is loaded in fp16, without the .float() here, A might be -inf
        # g: per-head log decay, always <= 0 because softplus is non-negative
        g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
        # When there are more value heads than key heads, repeat q/k so head counts match
        if self.num_v_heads // self.num_k_heads > 1:
            query = query.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)
            key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)

        # Use chunk-based implementation without cache
        core_attn_out, _ = self.chunk_gated_delta_rule(
            query,
            key,
            value,
            g=g,
            beta=beta,
            initial_state=None,
            output_final_state=False,
            use_qk_l2norm_in_kernel=True,
        )

        z_shape_og = z.shape
        # reshape input data into 2D tensor
        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
        z = z.reshape(-1, z.shape[-1])
        # Gated norm: z acts as the gate for the normalized attention output
        core_attn_out = self.norm(core_attn_out, z)
        core_attn_out = core_attn_out.reshape(z_shape_og)
        # Merge heads back into one channel axis for the output projection
        core_attn_out = core_attn_out.reshape(core_attn_out.shape[0], core_attn_out.shape[1], -1)

        output = self.out_proj(core_attn_out)
        output = self.dropout(output)  # Apply dropout after output projection
        
        return output, current_layer_fan


class PolyNorm(torch.nn.Module):
    """
    Polynomial activation built from RMS-normalized powers of the input.

    The output is a learned weighted sum of the normalized cubic, quadratic
    and linear terms of ``x`` plus a learned scalar bias. The three mixing
    weights start at 1/3 each and the bias starts at 0.

    Args:
        eps: Constant added to the mean square before the reciprocal square
            root, to keep the normalization finite for all-zero rows.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        # weight[0] -> cubic term, weight[1] -> quadratic term, weight[2] -> linear term
        self.weight = torch.nn.Parameter(torch.ones(3) / 3)
        self.bias = torch.nn.Parameter(torch.zeros(1))
        self.eps = eps

    def _norm(self, x):
        """Scale ``x`` by the reciprocal root-mean-square taken over its last dimension."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        """Return the weighted sum of normalized ``x**3``, ``x**2`` and ``x``, plus the bias."""
        cubic_term = self.weight[0] * self._norm(x**3)
        quadratic_term = self.weight[1] * self._norm(x**2)
        linear_term = self.weight[2] * self._norm(x)
        return cubic_term + quadratic_term + linear_term + self.bias


class NeoLLMMLP(nn.Module):
    """
    Gated feed-forward block whose projections read FAN-transformed features.

    The input first passes through a ``FANLayer``; the gate and up projections
    then consume that widened representation, the gate branch is activated with
    ``PolyNorm``, the two branches are multiplied element-wise, dropout is
    applied, and the result is projected back to ``hidden_size``.

    Config fields read: ``hidden_size``, ``intermediate_size``, ``dropout_rate``
    and, optionally, ``fan_ratio_ffn`` (falls back to 0.0625 when absent).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        # Read the FFN FAN ratio once so the layer and the width computation agree.
        ffn_fan_ratio = getattr(config, 'fan_ratio_ffn', 0.0625)
        self.fan_layer = FANLayer(
            hidden_size=config.hidden_size,
            fan_ratio=ffn_fan_ratio,
        )

        # Width of the features fed to the projections.
        # NOTE(review): assumes FANLayer (defined elsewhere in this file) emits
        # hidden_size + int(hidden_size * fan_ratio) features - confirm.
        projection_in_features = config.hidden_size + int(config.hidden_size * ffn_fan_ratio)

        # Gated architecture operating on the FAN-transformed features.
        self.gate_proj = nn.Linear(projection_in_features, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(projection_in_features, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = PolyNorm()

        # Dropout applied to the gated hidden activations.
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, x):
        """Apply FAN transform, gated projection, dropout and the down projection to ``x``."""
        fan_features = self.fan_layer(x)

        gate = self.act_fn(self.gate_proj(fan_features))
        gated = gate * self.up_proj(fan_features)
        return self.down_proj(self.dropout(gated))


class NeoLLMDecoderLayer(GradientCheckpointingLayer):
    """
    One decoder block: token mixer (linear or full attention) followed by the MLP.

    Both sub-blocks follow the same pattern: SeeDNorm -> LNS -> sub-layer ->
    residual add -> GPAS. The token mixer is selected per layer through
    ``config.layer_types[layer_idx]``. The FAN features returned by the token
    mixer are stored on ``self.current_layer_fan`` as a side effect of
    ``forward`` so the caller can pick them up afterwards.
    """

    def __init__(self, config: NeoLLMConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.layer_idx = layer_idx

        # Token mixer: only the module matching this layer's type is created.
        self.layer_type = config.layer_types[layer_idx]
        if self.layer_type == "linear_attention":
            self.linear_attn = NeoLLMGatedDeltaNet(config, layer_idx)
        elif self.layer_type == "full_attention":
            self.self_attn = NeoLLMAttention(config, layer_idx)

        # Feed-forward block (applies its own FAN transform internally).
        self.mlp = NeoLLMMLP(config)

        # SeeDNorm in front of the token mixer and in front of the MLP.
        self.input_layernorm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)

        # LNS (LayerNorm Scaling), one per sub-block, parameterized by layer index.
        self.lns_attn = LNS(layer_idx)
        self.lns_mlp = LNS(layer_idx)

        # GPAS (Gradient-Preserving Activation Scaling), applied after each residual add.
        self.gpas_attn = GPAS(config.hidden_size)
        self.gpas_mlp = GPAS(config.hidden_size)

        # ResFormer: FAN features produced by this layer's token mixer in the last forward.
        self.current_layer_fan = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        first_layer_fan: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> torch.FloatTensor:
        """
        Run the token mixer and the MLP, each wrapped in norm/LNS/residual/GPAS.

        Args:
            hidden_states: Input activations of this layer.
            position_embeddings: Rotary embeddings; forwarded to full attention only.
            attention_mask: Mask forwarded to whichever token mixer this layer owns.
            first_layer_fan: FAN features of the first layer (None for the first
                layer itself), forwarded to the token mixer.
            **kwargs: Extra attention kwargs; forwarded to full attention only.

        Returns:
            The layer output. ``self.current_layer_fan`` is updated as a side effect
            whenever a token mixer runs.
        """
        # ---- token-mixer sub-block ----
        skip = hidden_states
        normed = self.lns_attn(self.input_layernorm(hidden_states))

        # With an unrecognized layer type no mixer runs and the normalized input
        # flows straight into the residual add.
        mixer_out = normed
        if self.layer_type == "linear_attention":
            mixer_out, self.current_layer_fan = self.linear_attn(
                hidden_states=normed,
                attention_mask=attention_mask,
                first_layer_fan=first_layer_fan,
            )
        elif self.layer_type == "full_attention":
            # The second return value of the attention module is not used here.
            mixer_out, _, self.current_layer_fan = self.self_attn(
                hidden_states=normed,
                attention_mask=attention_mask,
                position_embeddings=position_embeddings,
                first_layer_fan=first_layer_fan,
                **kwargs,
            )

        hidden_states = self.gpas_attn(skip + mixer_out)

        # ---- feed-forward sub-block ----
        skip = hidden_states
        normed = self.lns_mlp(self.post_attention_layernorm(hidden_states))
        mlp_out = self.mlp(normed)
        hidden_states = self.gpas_mlp(skip + mlp_out)

        return hidden_states


class NeoLLMPreTrainedModel(PreTrainedModel):
    """
    Shared base for NeoLLM models: class-level capability flags plus the
    model-specific part of weight initialization.
    """

    config: NeoLLMConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["NeoLLMDecoderLayer"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _is_stateful = True

    @staticmethod
    def _init_resformer_lambdas(module):
        """Set whichever of ``lambda_1`` / ``lambda_2`` exist on ``module`` to 0.5."""
        for lambda_name in ("lambda_1", "lambda_2"):
            if hasattr(module, lambda_name):
                getattr(module, lambda_name).data.fill_(0.5)

    def _init_weights(self, module):
        """
        Initialize ``module``: run the parent initialization first, then apply
        the NeoLLM-specific overrides below.
        """
        super()._init_weights(module)

        if isinstance(module, NeoLLMGatedDeltaNet):
            module.dt_bias.data.fill_(1.0)
            # A_log holds the log of a value sampled uniformly from [0, 16).
            module.A_log.data.uniform_(0, 16).log_()
            # ResFormer mixing coefficients for linear attention.
            self._init_resformer_lambdas(module)
        elif isinstance(module, NeoLLMAttention):
            # ResFormer mixing coefficients for full attention.
            self._init_resformer_lambdas(module)
        elif isinstance(module, GPAS):
            # GPAS alpha starts at 0.
            module.alpha.data.fill_(0.0)
        # FANLayer and SeeDNorm receive no extra handling in this method.


class NeoLLMModel(NeoLLMPreTrainedModel):
    """
    NeoLLM backbone: token embedding, a stack of ``NeoLLMDecoderLayer`` blocks,
    and a final SeeDNorm. No key/value cache is used; every forward pass
    processes the full input sequence.
    """

    def __init__(self, config: NeoLLMConfig):
        super().__init__(config)
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)

        # Each layer creates its own components (no shared parameters)
        self.layers = nn.ModuleList(
            [NeoLLMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        # SeeDNorm for final output normalization
        self.norm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = NeoLLMRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # ResFormer: storage for first layer's FAN features (H_fan_1)
        self.first_layer_fan = None

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        """
        Run the decoder stack over the whole input sequence.

        Args:
            input_ids: Token ids; mutually exclusive with ``inputs_embeds``.
            attention_mask: Optional padding mask. It is turned into a causal
                mask for full-attention layers and passed (or dropped when it
                is all ones) to linear-attention layers.
            position_ids: Optional position ids used for the rotary embeddings.
                Defaults to ``0..seq_len-1`` with a leading batch dimension of 1.
            inputs_embeds: Pre-computed embeddings; mutually exclusive with
                ``input_ids``.
            **kwargs: Forwarded unchanged to every decoder layer.

        Returns:
            ``BaseModelOutputWithPast`` with ``last_hidden_state`` set to the
            normalized output of the last layer and ``past_key_values=None``.

        Raises:
            ValueError: If neither or both of ``input_ids`` / ``inputs_embeds``
                are given.
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        # There is no cache, so the absolute index of every query token is simply
        # 0..seq_len-1. Derive it from the sequence length instead of from
        # `position_ids`: caller-supplied position ids may be batched (B, T) or
        # restart from 0 for packed sequences, and in both cases
        # `position_ids.squeeze(0)` is not a valid 1-D cache position.
        cache_position = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device)

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=None,
            position_ids=position_ids,
        )
        linear_attn_mask = self._update_linear_attn_mask(attention_mask, cache_position)

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # ResFormer: reset first_layer_fan at the start of each forward pass so
        # the first layer receives None and later layers receive its features.
        self.first_layer_fan = None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            # Full-attention layers get the causal mask, linear-attention layers
            # get the (possibly dropped) padding mask.
            layer_mask = linear_attn_mask if decoder_layer.layer_type == "linear_attention" else causal_mask

            hidden_states = decoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=layer_mask,
                first_layer_fan=self.first_layer_fan,  # Pass H_fan_1 to all layers
                **kwargs,
            )

            # ResFormer: capture H_fan_1 from the first layer that exposes
            # non-None FAN features; it is then reused for all later layers.
            if self.first_layer_fan is None and hasattr(decoder_layer, 'current_layer_fan'):
                self.first_layer_fan = decoder_layer.current_layer_fan

        # Apply SeeDNorm for final normalization
        hidden_states = self.norm(hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=None,
        )

    def _update_linear_attn_mask(self, attention_mask, cache_position):
        """
        Return the mask to hand to linear-attention layers.

        NOTE: Left-padding is used for linear attention mask.
        When the mask is all ones every input is attended to, so it is replaced
        by ``None``; otherwise the mask is returned unchanged.
        ``cache_position`` is accepted but not used.
        """
        linear_attn_mask = attention_mask
        if attention_mask is not None and torch.all(attention_mask == 1):
            linear_attn_mask = None
        return linear_attn_mask


@torch.compiler.disable
def compute_cce_loss(hidden_states, labels, lm_head_weight, lm_head_bias=None, pad_token_id=None):
    """
    Compute the cut-cross-entropy loss outside of ``torch.compile``.

    Args:
        hidden_states: Final hidden states fed to the classifier.
        labels: Target token ids; moved to the device of ``hidden_states``.
        lm_head_weight: Weight of the output projection.
        lm_head_bias: Optional bias of the output projection.
        pad_token_id: When given, every label equal to this id is replaced
            by -100 before the loss is computed.

    Returns:
        The value returned by ``linear_cross_entropy`` called with
        ``shift=1``, ``impl="cce_kahan_full_c"`` and ``reduction="mean"``.
    """
    targets = labels.to(hidden_states.device)

    # Replace padding targets by -100 so they are masked out of the loss.
    # NOTE(review): relies on -100 being linear_cross_entropy's ignore index - confirm.
    if pad_token_id is not None:
        targets = targets.masked_fill(targets == pad_token_id, -100)

    return linear_cross_entropy(
        hidden_states,
        lm_head_weight,
        targets,
        bias=lm_head_bias,
        shift=1,
        impl="cce_kahan_full_c",
        reduction="mean",
    )


class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
    """
    NeoLLM backbone with a linear language-model head.

    With ``labels`` the forward pass returns only a loss (computed by
    ``compute_cce_loss`` directly from hidden states, so no logits are
    produced); without ``labels`` it returns only logits.
    """

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = NeoLLMModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        """
        Run the backbone, then either compute the training loss or the logits.

        Args:
            input_ids, attention_mask, position_ids, inputs_embeds, **kwargs:
                Forwarded unchanged to the backbone.
            labels: When given, the loss is computed and ``logits`` is ``None``.
            logits_to_keep: Only used without ``labels``. An int keeps the last
                ``logits_to_keep`` positions (0 keeps all of them, since
                ``slice(-0, None)`` spans the whole axis); any other value is
                used directly as the index along the sequence axis.

        Returns:
            ``CausalLMOutputWithPast`` with exactly one of ``loss`` / ``logits``
            set and ``past_key_values=None``.
        """
        backbone_out: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )
        final_states = backbone_out.last_hidden_state

        loss = None
        logits = None
        if labels is None:
            # Inference path: project the requested positions onto the vocabulary.
            if isinstance(logits_to_keep, int):
                keep = slice(-logits_to_keep, None)
            else:
                keep = logits_to_keep
            logits = self.lm_head(final_states[:, keep, :])
        else:
            # Training path: loss straight from hidden states, logits are skipped.
            loss = compute_cce_loss(
                final_states,
                labels,
                self.lm_head.weight,
                getattr(self.lm_head, 'bias', None),
                self.config.pad_token_id,
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
            hidden_states=backbone_out.hidden_states,
            attentions=backbone_out.attentions,
        )


# ==================== AUTOMODEL REGISTRATION ====================

# Names exported by a star-import of this module.
__all__ = [
    "NeoLLMForCausalLM",
    "NeoLLMModel",
    "NeoLLMPreTrainedModel",
    "NeoLLMConfig",
    "FANLayer",
    "SeeDNorm",
]

# Register the configuration and model classes for AutoClass support.
# These calls run at import time: "neollm" is bound to NeoLLMConfig, and
# NeoLLMConfig is bound to the backbone and causal-LM classes respectively.
# NOTE(review): assumes AutoConfig, AutoModel and AutoModelForCausalLM are
# imported earlier in this file - confirm.
AutoConfig.register("neollm", NeoLLMConfig)
AutoModel.register(NeoLLMConfig, NeoLLMModel)
AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)