thejagstudio committed on
Commit
94f7913
·
1 Parent(s): 719d04f

Upload 8 files

Files changed (7)
  1. README.md +12 -10
  2. app-0.py +467 -0
  3. app.py +798 -0
  4. final_model.pth +3 -0
  5. meta.pkl +3 -0
  6. model_epoch_1.pth +3 -0
  7. requirements.txt +5 -0
README.md CHANGED
@@ -1,10 +1,12 @@
- ---
- title: NanoDiffusion
- emoji: 🍃
- colorFrom: yellow
- colorTo: green
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Diffusion GPT
+ emoji: 🖊️📖🌀
+ colorFrom: gray
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 5.49.1
+ app_file: app.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app-0.py ADDED
@@ -0,0 +1,467 @@
+ import gradio as gr
+ import spaces
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ import numpy as np
+ import math
+ import os
+ import pickle
+ import requests
+ import textwrap
+ import subprocess
+ import shutil
+ import time
+ from dataclasses import dataclass
+ from typing import Optional
+
+ # --- 1. Automated Environment and Data Setup ---
+
+ def setup_environment():
+     """
+     Checks for and sets up the necessary data and code.
+     - Clones nanoGPT if not present.
+     - Copies the shakespeare_char dataset directory.
+     - Runs the data preparation script to create meta.pkl and binary files.
+     This function makes the script self-contained.
+     """
+     nano_gpt_repo_path = 'nanoGPT'
+     data_dir_path = 'shakespeare_char'
+     meta_path = os.path.join(data_dir_path, 'meta.pkl')
+
+     if os.path.exists(meta_path):
+         print("Dataset and metadata found. Skipping setup.")
+         return
+
+     print("Required data not found. Starting one-time setup...")
+
+     if not os.path.exists(nano_gpt_repo_path):
+         print("Cloning nanoGPT repository...")
+         try:
+             subprocess.run(
+                 ['git', 'clone', 'https://github.com/karpathy/nanoGPT.git'],
+                 check=True, capture_output=True, text=True
+             )
+             print("Cloned successfully.")
+         except subprocess.CalledProcessError as e:
+             print(f"Error cloning repository: {e.stderr}")
+             raise
+     else:
+         print("nanoGPT repository already exists.")
+
+     source_data_dir = os.path.join(nano_gpt_repo_path, 'data', 'shakespeare_char')
+     if not os.path.exists(data_dir_path):
+         print(f"Copying '{source_data_dir}' to '{data_dir_path}'...")
+         shutil.copytree(source_data_dir, data_dir_path)
+         print("Copied successfully.")
+     else:
+         print(f"'{data_dir_path}' directory already exists.")
+
+     prepare_script_path = os.path.join(data_dir_path, 'prepare.py')
+     if not os.path.exists(meta_path):
+         print(f"Running data preparation script: '{prepare_script_path}'...")
+         try:
+             subprocess.run(
+                 ['python', 'prepare.py'],
+                 check=True, cwd=data_dir_path, capture_output=True, text=True
+             )
+             print("Data preparation script finished successfully.")
+         except subprocess.CalledProcessError as e:
+             print(f"Error running prepare.py: {e.stderr}")
+             raise
+
+     print("Setup complete.")
+
+ setup_environment()
+
+ # --- 2. Global Setup & Helper Functions ---
+
+ data_dir = './shakespeare_char/'
+
+ def download_file(url, filename):
+     """Downloads a file from a URL if it doesn't exist."""
+     if os.path.exists(filename):
+         print(f"'{filename}' already exists. Skipping download.")
+         return
+     print(f"Downloading '{filename}' from '{url}'...")
+     try:
+         response = requests.get(url, stream=True)
+         response.raise_for_status()  # Check for download errors
+         with open(filename, 'wb') as f:
+             for chunk in response.iter_content(chunk_size=8192):
+                 f.write(chunk)
+         print("Download complete.")
+     except requests.exceptions.RequestException as e:
+         print(f"Error downloading {url}: {e}")
+         raise
+
+ # Define file URLs and local paths
+ meta_url = 'https://huggingface.co/spaces/thejagstudio/diffusion-gpt/resolve/main/meta.pkl'
+ meta_path = 'meta.pkl'
+ download_file(meta_url, meta_path)
+ with open(meta_path, 'rb') as f:
+     meta = pickle.load(f)
+
+ vocab_size = meta['vocab_size']
+ itos = meta['itos']
+ stoi = meta['stoi']
+ context_length = 256
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ def decode(indices_tensor: torch.Tensor):
+     if indices_tensor.dim() > 1:
+         indices_tensor = indices_tensor.squeeze(0)
+     indices = indices_tensor.cpu().numpy()
+     return ''.join([itos.get(i, '?') for i in indices])
+
+ def wrap_text(long_text, width=80):
+     paragraphs = long_text.splitlines()
+     wrapped = [textwrap.fill(p, width=width) if p else '' for p in paragraphs]
+     return "\n".join(wrapped)
+
+
+ # --- 3. Model Architecture (Identical to Notebook) ---
+
+ @dataclass
+ class GPTConfig:
+     block_size: int = 1024
+     vocab_size: int = 50304
+     n_layer: int = 12
+     n_head: int = 12
+     n_embd: int = 768
+     cond_dim: int = 64
+     dropout: float = 0.0
+     bias: bool = False
+
+ class MLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
+         self.gelu = nn.GELU()
+         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
+         self.dropout = nn.Dropout(config.dropout)
+     def forward(self, x):
+         x = self.c_fc(x)
+         x = self.gelu(x)
+         x = self.c_proj(x)
+         x = self.dropout(x)
+         return x
+
+ class SelfAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         assert config.n_embd % config.n_head == 0
+         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+         self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+         self.attn_dropout = nn.Dropout(config.dropout)
+         self.resid_dropout = nn.Dropout(config.dropout)
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+         self.dropout = config.dropout
+         self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
+     def forward(self, x):
+         B, T, C = x.size()
+         q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         if self.flash:
+             # is_causal=False: diffusion denoising attends bidirectionally.
+             y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=False)
+         else:
+             att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+             att = F.softmax(att, dim=-1)
+             att = self.attn_dropout(att)
+             y = att @ v
+         y = y.transpose(1, 2).contiguous().view(B, T, C)
+         y = self.resid_dropout(self.c_proj(y))
+         return y
+
+ def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+     return x * (1 + scale) + shift
+
+ def bias_add_scale(x: torch.Tensor, bias: Optional[torch.Tensor], scale: torch.Tensor, residual: Optional[torch.Tensor]) -> torch.Tensor:
+     if bias is not None:
+         out = scale * (x + bias)
+     else:
+         out = scale * x
+     if residual is not None:
+         out = residual + out
+     return out
+
+ class DDiTBlock(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.ln_1 = nn.LayerNorm(config.n_embd, bias=config.bias)
+         self.attn = SelfAttention(config)
+         self.ln_2 = nn.LayerNorm(config.n_embd, bias=config.bias)
+         self.mlp = MLP(config)
+         self.adaLN_modulation = nn.Linear(config.cond_dim, 6 * config.n_embd)
+         self.adaLN_modulation.weight.data.zero_()
+         self.adaLN_modulation.bias.data.zero_()
+     def forward(self, x, c):
+         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c)[:, None].chunk(6, dim=2)
+         x_skip = x
+         x = modulate(self.ln_1(x), shift_msa, scale_msa)
+         x = self.attn(x)
+         # NOTE: attention runs a second time on a re-normalized copy below. A
+         # standard adaLN-Zero block applies it once, but this matches the
+         # notebook the checkpoint was trained with, so it is kept as-is.
+         x = bias_add_scale(self.attn(self.ln_1(x)), None, gate_msa, x_skip)
+         x = bias_add_scale(self.mlp(modulate(self.ln_2(x), shift_mlp, scale_mlp)), None, gate_mlp, x)
+         return x
+
+ class DDitFinalLayer(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.norm_final = nn.LayerNorm(config.n_embd, bias=config.bias)
+         self.linear = nn.Linear(config.n_embd, config.vocab_size)
+         self.linear.weight.data.zero_()
+         self.linear.bias.data.zero_()
+         self.adaLN_modulation = nn.Linear(config.cond_dim, 2 * config.n_embd)
+         self.adaLN_modulation.weight.data.zero_()
+         self.adaLN_modulation.bias.data.zero_()
+     def forward(self, x, c):
+         shift, scale = self.adaLN_modulation(c)[:, None].chunk(2, dim=2)
+         x = modulate(self.norm_final(x), shift, scale)
+         x = self.linear(x)
+         return x
+
+ class TimestepEmbedder(nn.Module):
+     def __init__(self, hidden_size, frequency_embedding_size=256):
+         super().__init__()
+         self.mlp = nn.Sequential(
+             nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+             nn.SiLU(),
+             nn.Linear(hidden_size, hidden_size, bias=True),
+         )
+         self.frequency_embedding_size = frequency_embedding_size
+     @staticmethod
+     def timestep_embedding(t, dim, max_period=10000):
+         half = dim // 2
+         freqs = torch.exp(
+             -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+         ).to(device=t.device)
+         args = t[:, None].float() * freqs[None]
+         embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+         if dim % 2:
+             embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+         return embedding
+     def forward(self, t):
+         t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+         t_emb = self.mlp(t_freq)
+         return t_emb
+
+ class GPT(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         assert config.vocab_size is not None
+         assert config.block_size is not None
+         self.config = config
+         self.sigma_map = TimestepEmbedder(config.cond_dim)
+         self.transformer = nn.ModuleDict(dict(
+             wte = nn.Embedding(config.vocab_size, config.n_embd),
+             wpe = nn.Embedding(config.block_size, config.n_embd),
+             drop = nn.Dropout(config.dropout),
+             h = nn.ModuleList([DDiTBlock(config) for _ in range(config.n_layer)]),
+             ln_f = nn.LayerNorm(config.n_embd, bias=config.bias),
+         ))
+         self.lm_head = DDitFinalLayer(config)
+         self.apply(self._init_weights)
+         for pn, p in self.named_parameters():
+             if pn.endswith('c_proj.weight'):
+                 torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+     def forward(self, idx, sigma):
+         sigma = sigma.reshape(-1)
+         b, t = idx.size()
+         c = F.silu(self.sigma_map(sigma))
+         assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+         pos = torch.arange(0, t, dtype=torch.long, device=idx.device)
+         tok_emb = self.transformer.wte(idx)
+         pos_emb = self.transformer.wpe(pos)
+         x = self.transformer.drop(tok_emb + pos_emb)
+         for block in self.transformer.h:
+             x = block(x, c)
+         x = self.transformer.ln_f(x)
+         x = self.lm_head(x, c)
+         # Zero the output at each position's current token: the model predicts
+         # (log) score ratios to *other* tokens only.
+         x = torch.scatter(x, -1, idx[..., None], torch.zeros_like(x[..., :1]))
+         return x
+
+ class GeometricNoise:
+     """Geometric schedule: sigma(t) = sigma_min**(1-t) * sigma_max**t."""
+     def __init__(self, sigma_min=1e-4, sigma_max=20):
+         self.sigmas = 1.0 * torch.tensor([sigma_min, sigma_max]).to(device)
+     def rate_noise(self, t):
+         return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t * (self.sigmas[1].log() - self.sigmas[0].log())
+     def total_noise(self, t):
+         return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t
+     def __call__(self, t):
+         return self.total_noise(t), self.rate_noise(t)
+
+ # --- 4. Inference & Sampling Logic (Identical to Notebook) ---
+
+ def transition(x_t: torch.Tensor, delta_sigma: torch.Tensor) -> torch.Tensor:
+     # Uniform transition kernel: spread (1 - exp(-delta_sigma)) of the mass
+     # uniformly over the vocabulary; the remainder stays on the current token.
+     base_prob = (1 - torch.exp(-delta_sigma[..., None])) / vocab_size
+     trans = torch.ones(*x_t.shape, vocab_size, device=x_t.device) * base_prob
+     trans = trans.scatter(-1, x_t[..., None], torch.zeros_like(trans))
+     diag_fill = 1 - trans.sum(dim=-1, keepdim=True)
+     trans = trans.scatter(-1, x_t[..., None], diag_fill)
+     return trans
+
+ def staggered_score(score, delta_sigma):
+     exp_factor = torch.exp(-delta_sigma)[..., None]
+     correction = ((exp_factor - 1) / (vocab_size * exp_factor)) * score.sum(dim=-1, keepdim=True)
+     return correction + score / exp_factor
+
+ def sample_categorical(probs: torch.Tensor) -> torch.Tensor:
+     # Gumbel-max trick: equivalent to sampling from the (unnormalized) probs.
+     eps = 1e-10
+     gumbel_noise = -torch.log(-torch.log(torch.rand_like(probs) + eps) + eps)
+     return torch.argmax(torch.log(probs + eps) + gumbel_noise, dim=-1)
+
+
+ # --- 5. Model Initialization and Loading ---
+
+ print("Initializing and loading the pretrained model...")
+ model_args = dict(n_layer=6, n_head=6, n_embd=384, cond_dim=64,
+                   bias=False, vocab_size=vocab_size, block_size=context_length, dropout=0.2)
+ config = GPTConfig(**model_args)
+ model = GPT(config)
+
+ model.load_state_dict(
+     torch.hub.load_state_dict_from_url(
+         'https://huggingface.co/spaces/thejagstudio/diffusion-gpt/resolve/main/final_model.pth?download=true',
+         # 'https://huggingface.co/spaces/thejagstudio/diffusion-gpt/resolve/main/model_epoch_1.pth?download=true',
+         map_location=device
+     )
+ )
+ model.to(device)
+ model.eval()
+
+ noise = GeometricNoise(sigma_min=1e-4, sigma_max=20)
+ print("Model loaded successfully.")
+
+
+ # --- 6. Gradio Interface Logic ---
+ @spaces.GPU
+ def generate_text(steps):
+     """
+     Fast generation phase. Runs the diffusion process and stores all
+     intermediate frames in a list, then returns the final text and the list.
+     """
+     steps = int(steps)
+     eps = 1e-5
+
+     # List to store each frame of the diffusion process
+     diffusion_frames = []
+
+     # Start with a random sample
+     x = torch.randint(0, vocab_size, (1, context_length), device=device)
+     initial_text = f"--- Initial Random Noise ---\n\n{wrap_text(decode(x[0]))}"
+     diffusion_frames.append(initial_text)
+
+     timesteps = torch.linspace(1, eps, steps + 1, device=device)
+     step_size = (1 - eps) / steps
+
+     with torch.no_grad():
+         for i in range(steps):
+             t = timesteps[i] * torch.ones(x.shape[0], 1, device=device)
+             curr_sigma_bar = noise(t)[0]
+
+             next_sigma_bar = noise(t - step_size)[0]
+             delta_sigma = curr_sigma_bar - next_sigma_bar
+
+             log_score = model(x, curr_sigma_bar)
+             score = torch.exp(log_score)
+
+             stag_score = staggered_score(score, delta_sigma)
+             probs = stag_score * transition(x, delta_sigma)
+             x = sample_categorical(probs)
+
+             # Store the frame
+             progress_text = f"--- Denoising Step {i + 1}/{steps} ---\n\n{wrap_text(decode(x[0]))}"
+             diffusion_frames.append(progress_text)
+
+         # Final denoising step
+         t = timesteps[steps] * torch.ones(x.shape[0], 1, device=device)
+         curr_sigma_bar = noise(t)[0]
+         delta_sigma = curr_sigma_bar
+
+         log_score = model(x, curr_sigma_bar)
+         score = torch.exp(log_score)
+         stag_score = staggered_score(score, delta_sigma)
+         probs = stag_score * transition(x, delta_sigma)
+         x = sample_categorical(probs)
+
+     final_text = f"--- Final Denoised Text (Step {steps}) ---\n\n{wrap_text(decode(x[0]))}"
+     diffusion_frames.append(final_text)
+
+     # Return the final text and the complete list of frames
+     return final_text, diffusion_frames
+
+ def replay_diffusion(frames, replay_speed):
+     """
+     Slow replay phase. Iterates through the stored frames and yields them
+     with a delay to create an animation effect.
+     """
+     delay = 0.5 / replay_speed  # Calculate delay based on speed multiplier
+     for frame in frames:
+         yield frame
+         time.sleep(delay)
+
+ # Define the Gradio UI
+ css = '''.gradio-container > .fillable {max-width: 720px !important}
+ h3{margin-top: 1em}
+ p{margin-top: 0}
+ textarea{font-family: monospace; background-color: black}
+ '''
+ with gr.Blocks(theme=gr.themes.Citrus(), css=css) as demo:
+     gr.Markdown(
+         """
+         # LLaDA-inspired diffusion language model
+         ### A tiny 11M-parameter character-based diffusion model
+         """
+     )
+
+     generate_button = gr.Button("Generate", variant="primary")
+
+     output_textbox = gr.Textbox(
+         label="Generated Text",
+         lines=15,
+         interactive=False,
+         show_copy_button=True,
+         placeholder="Generation will appear here..."
+     )
+     with gr.Row():
+         steps_slider = gr.Slider(
+             minimum=64,
+             maximum=512,
+             value=128,
+             step=1,
+             label="Denoising Steps",
+             info="Number of steps in the generation process."
+         )
+         speed_slider = gr.Slider(
+             minimum=1,
+             maximum=20,
+             value=10,
+             step=1,
+             label="Replay Speed",
+             info="Controls the speed of the animation after generation.",
+             visible=False
+         )
+
+     diffusion_frames_state = gr.State([])
+
+     generate_event = generate_button.click(
+         fn=generate_text,
+         inputs=[steps_slider],
+         outputs=[output_textbox, diffusion_frames_state]
+     ).then(
+         fn=replay_diffusion,
+         inputs=[diffusion_frames_state, speed_slider],
+         outputs=[output_textbox]
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
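
A quick note on the sampling math in `generate_text` above: each denoising step multiplies a "staggered" score by a uniform transition kernel and Gumbel-samples the result. A minimal self-contained sketch on a toy vocabulary (the random `score` tensor is a stand-in for `exp(model(x, sigma))`; everything else mirrors the functions in the file):

```python
import torch

vocab_size = 5
x = torch.randint(0, vocab_size, (1, 8))      # toy sequence of 8 token ids
delta_sigma = torch.tensor([[0.3]])           # noise removed in this step

def transition(x_t, delta_sigma):
    # Uniform kernel: (1 - exp(-delta_sigma))/V off-diagonal, rest on the diagonal.
    base_prob = (1 - torch.exp(-delta_sigma[..., None])) / vocab_size
    trans = torch.ones(*x_t.shape, vocab_size) * base_prob
    trans = trans.scatter(-1, x_t[..., None], torch.zeros_like(trans))
    diag_fill = 1 - trans.sum(dim=-1, keepdim=True)
    return trans.scatter(-1, x_t[..., None], diag_fill)

def staggered_score(score, delta_sigma):
    exp_factor = torch.exp(-delta_sigma)[..., None]
    correction = ((exp_factor - 1) / (vocab_size * exp_factor)) * score.sum(dim=-1, keepdim=True)
    return correction + score / exp_factor

score = torch.rand(1, 8, vocab_size)          # stand-in for exp(model log-score)
probs = staggered_score(score, delta_sigma) * transition(x, delta_sigma)

# Gumbel-max sampling, as in sample_categorical above.
gumbel = -torch.log(-torch.log(torch.rand_like(probs) + 1e-10) + 1e-10)
x_next = torch.argmax(torch.log(probs + 1e-10) + gumbel, dim=-1)
print(x_next.shape)  # torch.Size([1, 8])
```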
app.py ADDED
@@ -0,0 +1,798 @@
+ import gradio as gr
+ import spaces
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ import numpy as np
+ import math
+ import os
+ import pickle
+ import requests
+ import textwrap
+ import subprocess
+ import shutil
+ import time
+ from dataclasses import dataclass
+ from typing import Optional
+ from transformers import AutoTokenizer
+
+ # ==============================================================================
+ # ------------------------- VERSION 1: SHARED SETUP ---------------------------
+ # ==============================================================================
+
+ def setup_environment():
+     """Checks for and sets up the necessary data for V1."""
+     nano_gpt_repo_path = 'nanoGPT'
+     data_dir_path = 'shakespeare_char'
+     meta_path = os.path.join(data_dir_path, 'meta.pkl')
+
+     if os.path.exists(meta_path):
+         return
+
+     print("Required data not found. Starting one-time setup...")
+     if not os.path.exists(nano_gpt_repo_path):
+         try:
+             subprocess.run(['git', 'clone', 'https://github.com/karpathy/nanoGPT.git'], check=True, capture_output=True, text=True)
+         except subprocess.CalledProcessError as e:
+             # Non-fatal: V1 falls back to the downloaded meta.pkl below.
+             print(f"Error cloning repository: {e.stderr}")
+
+     source_data_dir = os.path.join(nano_gpt_repo_path, 'data', 'shakespeare_char')
+     if not os.path.exists(data_dir_path) and os.path.exists(source_data_dir):
+         shutil.copytree(source_data_dir, data_dir_path)
+
+     # Check if we can run prepare.py
+     prepare_script_path = os.path.join(data_dir_path, 'prepare.py')
+     if os.path.exists(prepare_script_path) and not os.path.exists(meta_path):
+         subprocess.run(['python', 'prepare.py'], check=True, cwd=data_dir_path, capture_output=True, text=True)
+
+ setup_environment()
+
+ def download_file(url, filename):
+     if os.path.exists(filename):
+         return
+     print(f"Downloading '{filename}'...")
+     try:
+         response = requests.get(url, stream=True)
+         response.raise_for_status()
+         with open(filename, 'wb') as f:
+             for chunk in response.iter_content(chunk_size=8192):
+                 f.write(chunk)
+     except requests.exceptions.RequestException as e:
+         print(f"Error downloading {url}: {e}")
+
+ # ==============================================================================
+ # ---------------------- VERSION 1: ARCHITECTURE & LOGIC ----------------------
+ # ==============================================================================
+
+ # V1 Constants and Meta Loading
+ v1_data_dir = './shakespeare_char/'
+ v1_meta_url = 'https://huggingface.co/spaces/thejagstudio/diffusion-gpt/resolve/main/meta.pkl'
+ v1_meta_path = 'meta.pkl'
+ download_file(v1_meta_url, v1_meta_path)
+
+ v1_vocab_size = 65  # Fallback: nanoGPT's shakespeare_char vocabulary size
+ v1_itos = {}
+ v1_stoi = {}
+
+ if os.path.exists(v1_meta_path):
+     with open(v1_meta_path, 'rb') as f:
+         meta = pickle.load(f)
+     v1_vocab_size = meta['vocab_size']
+     v1_itos = meta['itos']
+     v1_stoi = meta['stoi']
+
+ v1_context_length = 256
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ def v1_decode(indices_tensor: torch.Tensor):
+     if indices_tensor.dim() > 1:
+         indices_tensor = indices_tensor.squeeze(0)
+     indices = indices_tensor.cpu().numpy()
+     return ''.join([v1_itos.get(i, '?') for i in indices])
+
+ def wrap_text(long_text, width=80):
+     paragraphs = long_text.splitlines()
+     wrapped = [textwrap.fill(p, width=width) if p else '' for p in paragraphs]
+     return "\n".join(wrapped)
+
+ @dataclass
+ class V1_GPTConfig:
+     block_size: int = 1024
+     vocab_size: int = 50304
+     n_layer: int = 12
+     n_head: int = 12
+     n_embd: int = 768
+     cond_dim: int = 64
+     dropout: float = 0.0
+     bias: bool = False
+
+ class V1_MLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
+         self.gelu = nn.GELU()
+         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
+         self.dropout = nn.Dropout(config.dropout)
+     def forward(self, x):
+         x = self.c_fc(x)
+         x = self.gelu(x)
+         x = self.c_proj(x)
+         x = self.dropout(x)
+         return x
+
+ class V1_SelfAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         assert config.n_embd % config.n_head == 0
+         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+         self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+         self.attn_dropout = nn.Dropout(config.dropout)
+         self.resid_dropout = nn.Dropout(config.dropout)
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+         self.dropout = config.dropout
+         self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
+     def forward(self, x):
+         B, T, C = x.size()
+         q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         if self.flash:
+             y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=False)
+         else:
+             att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+             att = F.softmax(att, dim=-1)
+             att = self.attn_dropout(att)
+             y = att @ v
+         y = y.transpose(1, 2).contiguous().view(B, T, C)
+         y = self.resid_dropout(self.c_proj(y))
+         return y
+
+ def v1_modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+     return x * (1 + scale) + shift
+
+ def v1_bias_add_scale(x: torch.Tensor, bias: Optional[torch.Tensor], scale: torch.Tensor, residual: Optional[torch.Tensor]) -> torch.Tensor:
+     if bias is not None:
+         out = scale * (x + bias)
+     else:
+         out = scale * x
+     if residual is not None:
+         out = residual + out
+     return out
+
+ class V1_DDiTBlock(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.ln_1 = nn.LayerNorm(config.n_embd, bias=config.bias)
+         self.attn = V1_SelfAttention(config)
+         self.ln_2 = nn.LayerNorm(config.n_embd, bias=config.bias)
+         self.mlp = V1_MLP(config)
+         self.adaLN_modulation = nn.Linear(config.cond_dim, 6 * config.n_embd)
+         self.adaLN_modulation.weight.data.zero_()
+         self.adaLN_modulation.bias.data.zero_()
+     def forward(self, x, c):
+         shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c)[:, None].chunk(6, dim=2)
+         x_skip = x
+         x = v1_modulate(self.ln_1(x), shift_msa, scale_msa)
+         x = self.attn(x)
+         # NOTE: attention runs a second time on a re-normalized copy below;
+         # kept as-is to match the notebook the checkpoint was trained with.
+         x = v1_bias_add_scale(self.attn(self.ln_1(x)), None, gate_msa, x_skip)
+         x = v1_bias_add_scale(self.mlp(v1_modulate(self.ln_2(x), shift_mlp, scale_mlp)), None, gate_mlp, x)
+         return x
+
+ class V1_DDitFinalLayer(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.norm_final = nn.LayerNorm(config.n_embd, bias=config.bias)
+         self.linear = nn.Linear(config.n_embd, config.vocab_size)
+         self.linear.weight.data.zero_()
+         self.linear.bias.data.zero_()
+         self.adaLN_modulation = nn.Linear(config.cond_dim, 2 * config.n_embd)
+         self.adaLN_modulation.weight.data.zero_()
+         self.adaLN_modulation.bias.data.zero_()
+     def forward(self, x, c):
+         shift, scale = self.adaLN_modulation(c)[:, None].chunk(2, dim=2)
+         x = v1_modulate(self.norm_final(x), shift, scale)
+         x = self.linear(x)
+         return x
+
+ class V1_TimestepEmbedder(nn.Module):
+     def __init__(self, hidden_size, frequency_embedding_size=256):
+         super().__init__()
+         self.mlp = nn.Sequential(
+             nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+             nn.SiLU(),
+             nn.Linear(hidden_size, hidden_size, bias=True),
+         )
+         self.frequency_embedding_size = frequency_embedding_size
+     @staticmethod
+     def timestep_embedding(t, dim, max_period=10000):
+         half = dim // 2
+         freqs = torch.exp(
+             -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+         ).to(device=t.device)
+         args = t[:, None].float() * freqs[None]
+         embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+         if dim % 2:
+             embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+         return embedding
+     def forward(self, t):
+         t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+         t_emb = self.mlp(t_freq)
+         return t_emb
+
+ class V1_GPT(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         assert config.vocab_size is not None
+         assert config.block_size is not None
+         self.config = config
+         self.sigma_map = V1_TimestepEmbedder(config.cond_dim)
+         self.transformer = nn.ModuleDict(dict(
+             wte = nn.Embedding(config.vocab_size, config.n_embd),
+             wpe = nn.Embedding(config.block_size, config.n_embd),
+             drop = nn.Dropout(config.dropout),
+             h = nn.ModuleList([V1_DDiTBlock(config) for _ in range(config.n_layer)]),
+             ln_f = nn.LayerNorm(config.n_embd, bias=config.bias),
+         ))
+         self.lm_head = V1_DDitFinalLayer(config)
+         self.apply(self._init_weights)
+         for pn, p in self.named_parameters():
+             if pn.endswith('c_proj.weight'):
+                 torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+     def forward(self, idx, sigma):
+         sigma = sigma.reshape(-1)
+         b, t = idx.size()
+         c = F.silu(self.sigma_map(sigma))
+         assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+         pos = torch.arange(0, t, dtype=torch.long, device=idx.device)
+         tok_emb = self.transformer.wte(idx)
+         pos_emb = self.transformer.wpe(pos)
+         x = self.transformer.drop(tok_emb + pos_emb)
+         for block in self.transformer.h:
+             x = block(x, c)
+         x = self.transformer.ln_f(x)
+         x = self.lm_head(x, c)
+         # Zero the entry for each position's current token (SEDD-style scores).
+         x = torch.scatter(x, -1, idx[..., None], torch.zeros_like(x[..., :1]))
+         return x
+
+ class V1_GeometricNoise:
+     def __init__(self, sigma_min=1e-4, sigma_max=20):
+         self.sigmas = 1.0 * torch.tensor([sigma_min, sigma_max]).to(device)
+     def rate_noise(self, t):
+         return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t * (self.sigmas[1].log() - self.sigmas[0].log())
+     def total_noise(self, t):
+         return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t
+     def __call__(self, t):
+         return self.total_noise(t), self.rate_noise(t)
+
+ # --- V1 Inference Logic ---
+ def v1_transition(x_t: torch.Tensor, delta_sigma: torch.Tensor) -> torch.Tensor:
+     base_prob = (1 - torch.exp(-delta_sigma[..., None])) / v1_vocab_size
+     trans = torch.ones(*x_t.shape, v1_vocab_size, device=x_t.device) * base_prob
+     trans = trans.scatter(-1, x_t[..., None], torch.zeros_like(trans))
+     diag_fill = 1 - trans.sum(dim=-1, keepdim=True)
+     trans = trans.scatter(-1, x_t[..., None], diag_fill)
+     return trans
+
+ def v1_staggered_score(score, delta_sigma):
+     exp_factor = torch.exp(-delta_sigma)[..., None]
+     correction = ((exp_factor - 1) / (v1_vocab_size * exp_factor)) * score.sum(dim=-1, keepdim=True)
+     return correction + score / exp_factor
+
+ def v1_sample_categorical(probs: torch.Tensor) -> torch.Tensor:
+     eps = 1e-10
+     gumbel_noise = -torch.log(-torch.log(torch.rand_like(probs) + eps) + eps)
+     return torch.argmax(torch.log(probs + eps) + gumbel_noise, dim=-1)
+
+ # --- V1 Model Loading ---
+ print("Initializing V1 Model...")
+ v1_model_args = dict(n_layer=6, n_head=6, n_embd=384, cond_dim=64,
+                      bias=False, vocab_size=v1_vocab_size, block_size=v1_context_length, dropout=0.2)
+ v1_config = V1_GPTConfig(**v1_model_args)
+ v1_model = V1_GPT(v1_config)
+ try:
+     v1_model.load_state_dict(
+         torch.hub.load_state_dict_from_url(
+             'https://huggingface.co/spaces/thejagstudio/diffusion-gpt/resolve/main/final_model.pth?download=true',
+             map_location=device
+         )
+     )
+     v1_model.to(device)
+     v1_model.eval()
+     print("V1 Model loaded successfully.")
+ except Exception as e:
+     print(f"Failed to load V1 model: {e}")
+     v1_model = None
+
+ v1_noise = V1_GeometricNoise(sigma_min=1e-4, sigma_max=20)
+
+
+ def v1_generate_stream(steps, speed):
+     """
+     Generator function for V1 that yields frames directly.
+     Combines generation and replay so the stream can be stopped immediately.
+     """
+     if v1_model is None:
+         yield "Error: V1 Model not loaded"
+         return
+
+     steps = int(steps)
+     speed = float(speed)
+     eps = 1e-5
+
+     # Delay between frames, derived from the speed slider (as in V2):
+     # 0.5 s is the base constant; higher speed scales it down.
+     delay = 0.5 / max(speed, 0.1)
+
+     x = torch.randint(0, v1_vocab_size, (1, v1_context_length), device=device)
+     initial_text = f"--- Initial Random Noise ---\n\n{wrap_text(v1_decode(x[0]))}"
+     yield initial_text
+     time.sleep(delay)
+
+     timesteps = torch.linspace(1, eps, steps + 1, device=device)
+     step_size = (1 - eps) / steps
+
+     with torch.no_grad():
+         for i in range(steps):
+             t = timesteps[i] * torch.ones(x.shape[0], 1, device=device)
+             curr_sigma_bar = v1_noise(t)[0]
+
+             next_sigma_bar = v1_noise(t - step_size)[0]
+             delta_sigma = curr_sigma_bar - next_sigma_bar
+
+             log_score = v1_model(x, curr_sigma_bar)
+             score = torch.exp(log_score)
+
+             stag_score = v1_staggered_score(score, delta_sigma)
+             probs = stag_score * v1_transition(x, delta_sigma)
+             x = v1_sample_categorical(probs)
+
+             progress_text = f"--- Denoising Step {i + 1}/{steps} ---\n\n{wrap_text(v1_decode(x[0]))}"
+             yield progress_text
+
+             # Artificial delay for visualization
+             if speed < 20:
+                 time.sleep(delay)
+
+         t = timesteps[steps] * torch.ones(x.shape[0], 1, device=device)
+         curr_sigma_bar = v1_noise(t)[0]
+         delta_sigma = curr_sigma_bar
+
+         log_score = v1_model(x, curr_sigma_bar)
+         score = torch.exp(log_score)
+         stag_score = v1_staggered_score(score, delta_sigma)
+         probs = stag_score * v1_transition(x, delta_sigma)
+         x = v1_sample_categorical(probs)
+
+     final_text = f"--- Final Denoised Text (Step {steps}) ---\n\n{wrap_text(v1_decode(x[0]))}"
+     yield final_text
+
+ # ==============================================================================
+ # ---------------------- VERSION 2: ARCHITECTURE & LOGIC ----------------------
+ # ==============================================================================
+
+ # PLEASE UPDATE THIS PATH TO YOUR ACTUAL LOCAL FILE OR URL
+ V2_MODEL_PATH = "checkpoints/model_fp32.pt"
+
+ class V2_RMSNorm(nn.Module):
+     def __init__(self, dim: int, eps: float = 1e-6):
+         super().__init__()
+         self.eps = eps
+         self.weight = nn.Parameter(torch.ones(dim))
+
+     def forward(self, x):
+         var = x.pow(2).mean(-1, keepdim=True)
+         x = x * torch.rsqrt(var + self.eps)
+         return self.weight * x
+
+ class V2_RotaryEmbedding(nn.Module):
+     def __init__(self, dim, max_position_embeddings=16384, base=100000, scaling_factor=1.0):
+         super().__init__()
+         self.scaling_factor = scaling_factor
+         self.dim = dim
+         self.base = base
+         self.max_position_embeddings = max_position_embeddings
+         self.inv_freq = None
+         self._cache = {}
+
+     def _update_freqs(self, device):
+         # NTK-aware base scaling (a no-op when scaling_factor == 1.0).
+         base = self.base * (self.scaling_factor ** (self.dim / (self.dim - 2)))
+         inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+         self.inv_freq = inv_freq
+
+     def forward(self, x, seq_len=None):
+         if seq_len is None:
+             seq_len = x.shape[-2]
+
+         if self.inv_freq is None or self.inv_freq.device != x.device:
+             self._update_freqs(x.device)
+
+         cache_key = (seq_len, x.device, x.dtype)
+         if cache_key in self._cache:
+             return self._cache[cache_key]
+
+         t = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype)
+         freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+         emb = torch.cat((freqs, freqs), dim=-1)
+
+         cos = emb.cos()[None, None, :, :]
+         sin = emb.sin()[None, None, :, :]
+
+         self._cache[cache_key] = (cos, sin)
+         if len(self._cache) > 10:
+             self._cache.pop(next(iter(self._cache)))
+
+         return cos, sin
+
+ def v2_apply_rotary_pos_emb(q, k, cos, sin):
+     def rotate_half(x):
+         x1 = x[..., : x.shape[-1] // 2]
+         x2 = x[..., x.shape[-1] // 2 :]
+         return torch.cat((-x2, x1), dim=-1)
+     q_embed = (q * cos) + (rotate_half(q) * sin)
+     k_embed = (k * cos) + (rotate_half(k) * sin)
+     return q_embed, k_embed
+
+ class V2_DiffusionAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.hidden_size = config.hidden_size
+         self.num_heads = config.num_attention_heads
+         self.head_dim = self.hidden_size // self.num_heads
+         self.num_key_value_heads = config.num_key_value_heads
+         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+         self.use_flash_attn = config.use_flash_attn
+
+         self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+         self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+     def forward(self, hidden_states, freqs_cis, attention_mask=None, past_kv=None):
+         bsz, q_len, _ = hidden_states.size()
+
+         q = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+         k = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+         v = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+         cos, sin = freqs_cis
+         cos = cos[:, :, :q_len, :]
+         sin = sin[:, :, :q_len, :]
+         q, k = v2_apply_rotary_pos_emb(q, k, cos, sin)
+
+         if past_kv is not None:
+             cache_k, cache_v = past_kv
+             k = torch.cat([cache_k, k], dim=2)
+             v = torch.cat([cache_v, v], dim=2)
+
+         current_kv = (k, v)
+         # Grouped-query attention: expand KV heads to match the query heads.
+         k = k.repeat_interleave(self.num_key_value_groups, dim=1)
+         v = v.repeat_interleave(self.num_key_value_groups, dim=1)
+
+         attn_mask = None
+         if attention_mask is not None:
+             attn_mask = attention_mask[:, None, None, :].to(dtype=q.dtype)
+             attn_mask = (1.0 - attn_mask) * torch.finfo(q.dtype).min
+
+         output = F.scaled_dot_product_attention(
+             q, k, v, attn_mask=attn_mask, dropout_p=0.0, is_causal=False
+         )
+
+         output = output.transpose(1, 2).contiguous().view(bsz, q_len, self.hidden_size)
+         return self.o_proj(output), current_kv
+
+ class V2_MLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+         self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+         self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+         self.act_fn = nn.SiLU()
+
+     def forward(self, x):
+         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+ class V2_BlockDiffusionBlock(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.self_attn = V2_DiffusionAttention(config)
+         self.mlp = V2_MLP(config)
+         self.input_layernorm = V2_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.post_attention_layernorm = V2_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.use_activation_checkpointing = config.use_activation_checkpointing
+
+     def forward(self, hidden_states, freqs_cis, attention_mask, past_kv):
+         return self._forward(hidden_states, freqs_cis, attention_mask, past_kv)
+
+     def _forward(self, hidden_states, freqs_cis, attention_mask, past_kv):
+         residual = hidden_states
+         hidden_states = self.input_layernorm(hidden_states)
+         attn_out, new_kv = self.self_attn(hidden_states, freqs_cis, attention_mask, past_kv)
+         hidden_states = residual + attn_out
+
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         hidden_states = residual + self.mlp(hidden_states)
+         return hidden_states, new_kv
+
+ @dataclass
+ class V2_ModelConfig:
+     vocab_size: int = 151936
+     hidden_size: int = 1024
+     intermediate_size: int = 2816
+     num_hidden_layers: int = 16
+     num_attention_heads: int = 16
+     num_key_value_heads: int = 4
+     max_position_embeddings: int = 16384
+     rms_norm_eps: float = 1e-6
+     rope_theta: float = 100000.0
+     pad_token_id: int = 0
+     mask_token_id: int = 1
+     use_flash_attn: bool = True
+     use_activation_checkpointing: bool = False
+     attention_dropout: float = 0.0
+     hidden_dropout: float = 0.0
+
+ # Alias kept so checkpoints pickled under the original class name can load.
+ ModelConfig = V2_ModelConfig
+
+ class V2_DiffusionLLM(nn.Module):
+     def __init__(self, config: V2_ModelConfig):
+         super().__init__()
+         self.config = config
+         pad_idx = config.pad_token_id if config.pad_token_id < config.vocab_size else None
+         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=pad_idx)
+
+         self.layers = nn.ModuleList([V2_BlockDiffusionBlock(config) for _ in range(config.num_hidden_layers)])
+         self.norm = V2_RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+         self.rotary_emb = V2_RotaryEmbedding(
+             config.hidden_size // config.num_attention_heads,
+             config.max_position_embeddings
+         )
+         self.lm_head.weight = self.embed_tokens.weight  # weight tying
+
+     def forward(self, input_ids, attention_mask=None, past_key_values=None):
+         bsz, seqlen = input_ids.shape
+         hidden_states = self.embed_tokens(input_ids)
+         freqs_cis = self.rotary_emb(hidden_states, seq_len=seqlen)
+
+         if past_key_values is None:
+             past_key_values = [None] * len(self.layers)
+
+         new_kvs = []
+         for i, layer in enumerate(self.layers):
+             hidden_states, kv = layer(hidden_states, freqs_cis, attention_mask, past_key_values[i])
+             new_kvs.append(kv)
+
+         hidden_states = self.norm(hidden_states)
+         logits = self.lm_head(hidden_states)
+         return logits, new_kvs
+
+ # Alias kept so checkpoints pickled under the original class name can load.
+ DiffusionLLM = V2_DiffusionLLM
+
+ # --- V2 Loading Logic ---
+ print("Initializing V2 components...")
+ v2_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
+ if v2_tokenizer.pad_token is None:
+     v2_tokenizer.pad_token = v2_tokenizer.eos_token
+
+ v2_model = None
+ v2_config = None
+
+ if os.path.exists(V2_MODEL_PATH):
+     print(f"Loading V2 model from {V2_MODEL_PATH}...")
+     try:
+         checkpoint = torch.load(V2_MODEL_PATH, map_location=device, weights_only=False)
+         v2_config = checkpoint['config']
+         v2_model = V2_DiffusionLLM(v2_config)
+         state_dict = checkpoint['model_state']
+         state_dict = {k: v.float() for k, v in state_dict.items()}
+         v2_model.load_state_dict(state_dict)
+         v2_model = v2_model.to(device)
+         v2_model.eval()
+         print("V2 Model loaded.")
+     except Exception as e:
+         print(f"Error loading V2 model: {e}")
+ else:
+     print(f"V2 Model file not found at {V2_MODEL_PATH}. Version 2 tab will not work without it.")
+
+
+ @torch.no_grad()
+ def v2_generate_block_diffusion(prompt, steps, block_size, max_new_tokens, replay_speed):
+     """
+     Refactored to yield frames for real-time streaming.
+     """
+     if v2_model is None:
+         yield "Error: V2 Model not found. Check path."
+         return
+
+     v2_model.eval()
+     # Handle inputs
+     steps = int(steps)
+     block_size = int(block_size)
+     max_new_tokens = int(max_new_tokens)
+     speed = float(replay_speed)
+
+     prompt_ids = v2_tokenizer.encode(prompt, return_tensors="pt").to(device)
+     config = v2_model.config
+     num_blocks = max_new_tokens // block_size
+
+     context_ids = prompt_ids
+
+     # Helper params
+     temperature = 1.0
+     top_k = 40
+     top_p = 0.9
+     repetition_penalty = 1.2
+
+     # Calculate delay based on speed slider
+     delay = 0.5 / max(speed, 0.1)
+
+     for block_idx in range(num_blocks):
+         mask_block = torch.full((1, block_size), config.mask_token_id, device=device)
+         is_masked = torch.ones(1, block_size, dtype=torch.bool, device=device)
+
+         for step_idx in range(steps):
+             # --- SNAPSHOT & YIELD ---
+             # Decode context
+             ctx_str = v2_tokenizer.decode(context_ids[0], skip_special_tokens=True)
+
+             # Decode block with masking visual
+             block_tokens = mask_block[0].tolist()
+             block_vis = []
+             for i, tid in enumerate(block_tokens):
+                 if is_masked[0, i]:
+                     block_vis.append("░")  # Mask symbol
+                 else:
+                     block_vis.append(v2_tokenizer.decode([tid], skip_special_tokens=False))
+
+             block_str = "".join(block_vis)
+
+             frame_text = (f"--- Generating Block {block_idx+1}/{num_blocks} | Step {step_idx+1}/{steps} ---\n\n"
+                           f"{ctx_str}{block_str}")
+
+             yield frame_text
+
+             # Artificial delay to visualize the step
+             if speed < 20:  # If max speed, skip sleep
+                 time.sleep(delay)
+             # ------------------------
+
+             full_input = torch.cat([context_ids, mask_block], dim=1)
+             attention_mask = torch.ones_like(full_input, dtype=torch.float32)
+
+             logits, _ = v2_model(full_input, attention_mask=attention_mask)
+             block_logits = logits[:, -block_size:, :]
+
+             # Repetition penalty
+             if repetition_penalty != 1.0:
+                 seen_tokens = set(context_ids[0].tolist())
+                 for i in range(block_size):
+                     if not is_masked[0, i]:
+                         seen_tokens.add(mask_block[0, i].item())
+                 for token_id in seen_tokens:
+                     if token_id < block_logits.shape[-1]:
+                         if block_logits[0, :, token_id].mean() > 0:
+                             block_logits[:, :, token_id] /= repetition_penalty
+                         else:
+                             block_logits[:, :, token_id] *= repetition_penalty
+
+             block_logits = block_logits / temperature
+
+             # Top-K
+             if top_k > 0:
+                 top_k_logits, top_k_indices = torch.topk(block_logits, top_k, dim=-1)
+                 block_logits = torch.full_like(block_logits, float('-inf'))
+                 block_logits.scatter_(-1, top_k_indices, top_k_logits)
+
+             # Top-P
+             if top_p < 1.0:
+                 sorted_logits, sorted_indices = torch.sort(block_logits, descending=True, dim=-1)
+                 cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+                 sorted_indices_to_remove = cumulative_probs > top_p
+                 sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+                 sorted_indices_to_remove[..., 0] = 0
+                 indices_to_remove = sorted_indices_to_remove.scatter(-1, sorted_indices, sorted_indices_to_remove)
+                 block_logits[indices_to_remove] = float('-inf')
+
+             probs = F.softmax(block_logits, dim=-1)
+             probs = torch.nan_to_num(probs, nan=0.0, posinf=0.0, neginf=0.0)
+             probs = probs.clamp(min=1e-10)
+             probs = probs / probs.sum(dim=-1, keepdim=True)
+
+             sampled_tokens = torch.multinomial(probs.view(-1, probs.size(-1)), num_samples=1)
+             sampled_tokens = sampled_tokens.view(1, block_size)
+
+             confidence = probs.gather(-1, sampled_tokens.unsqueeze(-1)).squeeze(-1)
+
+             tokens_to_unmask = max(1, block_size // steps)
+             if step_idx == steps - 1:
+                 tokens_to_unmask = is_masked.sum().item()
+
+             if tokens_to_unmask > 0 and is_masked.sum() > 0:
+                 masked_confidence = confidence.clone()
+                 masked_confidence[~is_masked] = -1.0
+                 num_to_unmask = min(tokens_to_unmask, is_masked.sum().item())
+                 _, top_indices = torch.topk(masked_confidence.view(-1), num_to_unmask)
+
+                 for idx in top_indices:
+                     mask_block[0, idx] = sampled_tokens[0, idx]
+                     is_masked[0, idx] = False
+
+         context_ids = torch.cat([context_ids, mask_block], dim=1)
+
+     generated_ids = context_ids[0].tolist()
+     final_text = v2_tokenizer.decode(generated_ids, skip_special_tokens=True)
+     yield final_text
+
+
+ # ==============================================================================
+ # ------------------------------- GRADIO UI -----------------------------------
+ # ==============================================================================
+
+ css = '''.gradio-container > .fillable {max-width: 900px !important}
+ h3{margin-top: 1em}
+ p{margin-top: 0}
+ textarea{font-family: monospace; background-color: #1a1b1e; color: #e0e0e0}
+ '''
+
+ with gr.Blocks(theme=gr.themes.Citrus(), css=css) as demo:
+     gr.Markdown("# Diffusion Language Models Playground")
+
+     with gr.Tabs():
+
+         # --- TAB 1: VERSION 1 (CHAR DIFFUSION) ---
+         with gr.Tab("Version 1: Character Diffusion (NanoGPT)"):
+             gr.Markdown("### A tiny 11M-parameter character-level diffusion model (continuous-time, discrete tokens).")
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     v1_steps = gr.Slider(64, 512, 128, step=1, label="Denoising Steps")
+                     v1_speed = gr.Slider(1, 20, 10, step=1, label="Generation/Replay Speed")
+                     with gr.Row():
+                         v1_btn = gr.Button("Generate", variant="primary")
+                         v1_stop = gr.Button("Stop", variant="stop")
+                 with gr.Column(scale=2):
+                     v1_out = gr.Textbox(label="Generated Text", lines=15, interactive=False)
+
+             # V1 Logic: Merged generation and replay for proper stopping
+             v1_event = v1_btn.click(v1_generate_stream, inputs=[v1_steps, v1_speed], outputs=[v1_out])
+             v1_stop.click(fn=None, inputs=None, outputs=None, cancels=[v1_event])
+
+         # --- TAB 2: VERSION 2 (BLOCK DIFFUSION) ---
+         with gr.Tab("Version 2: Block Diffusion (LLaDA-style)"):
+             gr.Markdown("### Block-based diffusion using the Qwen tokenizer.")
+             if v2_model is None:
+                 gr.Warning(f"V2 Model not loaded. Please check path: {V2_MODEL_PATH}")
+
+             with gr.Row():
+                 with gr.Column(scale=1):
+                     v2_prompt = gr.Textbox(label="Prompt", value="The king went to the")
+                     v2_steps = gr.Slider(4, 64, 16, step=1, label="Steps per Block")
+                     v2_block_size = gr.Slider(8, 126, 32, step=8, label="Block Size")
+                     v2_max_tokens = gr.Slider(32, 1024, 128, step=32, label="Total New Tokens")
+                     v2_speed = gr.Slider(1, 20, 1, step=1, label="Generation/Replay Speed")
+                     with gr.Row():
+                         v2_btn = gr.Button("Generate", variant="primary")
+                         v2_stop = gr.Button("Stop", variant="stop")
+                 with gr.Column(scale=2):
+                     v2_out = gr.Textbox(label="Generated Text", lines=15, interactive=False)
+
+             # V2 Logic
+             v2_event = v2_btn.click(
+                 v2_generate_block_diffusion,
+                 inputs=[v2_prompt, v2_steps, v2_block_size, v2_max_tokens, v2_speed],
+                 outputs=[v2_out]
+             )
+             v2_stop.click(fn=None, inputs=None, outputs=None, cancels=[v2_event])
+
+ if __name__ == "__main__":
+     demo.launch()
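
For the V2 path, the core idea is confidence-based unmasking: each step samples every masked position in the current block, then commits only the highest-confidence samples. A toy sketch of just that rule (the random `probs` tensor stands in for the model's softmax output; `mask_id` mirrors `config.mask_token_id`):

```python
import torch

block_size, vocab_size, steps = 8, 100, 4
mask_id = 1                                   # stand-in for config.mask_token_id
mask_block = torch.full((1, block_size), mask_id)
is_masked = torch.ones(1, block_size, dtype=torch.bool)

for step in range(steps):
    # Stand-in for the model: random per-position distributions over the vocab.
    probs = torch.rand(1, block_size, vocab_size).softmax(dim=-1)
    sampled = torch.multinomial(probs.view(-1, vocab_size), 1).view(1, block_size)
    confidence = probs.gather(-1, sampled.unsqueeze(-1)).squeeze(-1)

    # Commit the highest-confidence masked positions; finish on the last step.
    n = max(1, block_size // steps) if step < steps - 1 else int(is_masked.sum())
    conf = confidence.clone()
    conf[~is_masked] = -1.0                   # never re-commit unmasked slots
    _, top = torch.topk(conf.view(-1), min(n, int(is_masked.sum())))
    mask_block[0, top] = sampled[0, top]
    is_masked[0, top] = False

print(mask_block.tolist(), bool(is_masked.any()))  # fully unmasked: False
```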
final_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:86da61c7e77f7062e51ec2974a723d6f195581e385ed802373d178ade0c81483
+ size 47047458
meta.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f221fcf4332ffda72912fad6e4b64e7988d6c4cba7ffcee98edbbc23f9a8400d
+ size 913
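
meta.pkl is the nanoGPT shakespeare_char vocabulary file; both apps read `vocab_size`, `stoi`, and `itos` from it. A quick way to inspect it locally, assuming it has been downloaded as the apps do at startup:

```python
# Inspect the character vocabulary V1 decodes with (sketch; assumes meta.pkl
# is present locally, as after the apps' download_file step).
import pickle

with open("meta.pkl", "rb") as f:
    meta = pickle.load(f)

print(meta["vocab_size"])          # 65 for the Shakespeare character set
print(len(meta["itos"]), len(meta["stoi"]))  # index<->character lookup tables
```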
model_epoch_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:413c52559c94e2e71d991e45147c8773e02d007b095443ff2ffaed5174e147a7
+ size 47038242
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio
+ torch>=2.0.0
+ numpy
+ requests
+ transformers
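
Note that both apps also import `spaces`, which Hugging Face Spaces preinstalls but this file does not list, so a local run likely needs it installed explicitly. A hedged local-bootstrap sketch:

```python
# Hypothetical local bootstrap (not part of this commit): install the listed
# requirements plus `spaces`, then launch the Gradio app.
import subprocess
import sys

subprocess.run([sys.executable, "-m", "pip", "install",
                "-r", "requirements.txt", "spaces"], check=True)
subprocess.run([sys.executable, "app.py"], check=True)
```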