rubenaghayan committed
Commit 75dbc58 · 1 Parent(s): f45427d

final edits
Files changed (3):
  1. app.py +47 -24
  2. details.py +12 -15
  3. limitations.py +12 -24
app.py CHANGED
@@ -1,3 +1,6 @@
+ import json
+ from pathlib import Path
+
  import gradio as gr
  import pandas as pd
  from functools import partial
@@ -6,17 +9,12 @@ from details import DETAILS
  from state import Model, Parallelism, Training
  from calculator import MemoryCalculation
  from dtypes import DType
- from gradio.themes import ThemeClass as Theme
+ from gradio.themes import Base
  from limitations import LIMITATIONS

  # Create a Number component for natural numbers (positive integers)
  NaturalNumber = partial(gr.Number, minimum=1, step=1, precision=0, interactive=True)

- colors = {
-
- }
-
-
  def create_parallelism_block():
  with gr.Column():
  gr.Markdown("# Parallelism")
@@ -235,12 +233,11 @@ def calculate(tp, pp, cp, ep, fsdp_enabled, fsdp_parallelism, fsdp_strategy, lay

  memory_data = pd.DataFrame(individual_data)

- # Define pastel color map
  color_map = {
- 'Parameter': '#B6E5D8', # Light Mint
- 'Gradient': '#FFB6C1', # Light Pink
- 'Optimizer': '#C7B3FF', # Light Purple
- 'Activation': '#FFD1A9', # Light Peach
+ 'Parameter': '#C2A9CE',
+ 'Gradient': '#AADB86',
+ 'Optimizer': '#F79090',
+ 'Activation': '#9EC7DD',
  }

  return gr.BarPlot(
@@ -279,11 +276,17 @@ css = """
  }
  """

- theme = Theme.from_hub("gstaff/xkcd")
- # otherwise invisible in light mode
- theme.checkbox_label_text_color=theme.block_label_text_color
- with gr.Blocks(theme=theme, css=css) as demo:
+ with gr.Blocks(theme='Base', css=css) as demo:
  with gr.Column():
+ gr.Markdown("# LLM Training Memory Visualizer")
+ gr.Markdown(
+ """
+ ## How to Use
+ 1. Use Presets OR Adjust the parallelism, model, and training panels to match your run.
+ 2. Press **Calculate** to refresh the memory breakdown chart.
+ 3. Review the details and references below for context on the estimates.
+ """
+ )
  with gr.Row(equal_height=True):
  tp, pp, cp, ep, fsdp_enabled, fsdp_parallelism, fsdp_strategy = create_parallelism_block()
  layers, vocab, hidden, intermediate, active_experts, total_experts, is_moe, presets, weight_tied_embeddings = create_model_block()
@@ -293,18 +296,38 @@ with gr.Blocks(theme=theme, css=css) as demo:

  calculate_button.click(
  fn=calculate,
- inputs=[tp, pp, cp, ep, fsdp_enabled, fsdp_parallelism, fsdp_strategy, layers, vocab, hidden, intermediate, active_experts, total_experts, is_moe, weight_tied_embeddings, seq_len, batch_size, gradient_checkpointing, grad_accumulation, precision, mixed_precision, param_dtype, reduce_dtype],
- outputs=output
+ inputs=[
+ tp,
+ pp,
+ cp,
+ ep,
+ fsdp_enabled,
+ fsdp_parallelism,
+ fsdp_strategy,
+ layers,
+ vocab,
+ hidden,
+ intermediate,
+ active_experts,
+ total_experts,
+ is_moe,
+ weight_tied_embeddings,
+ seq_len,
+ batch_size,
+ gradient_checkpointing,
+ grad_accumulation,
+ precision,
+ mixed_precision,
+ param_dtype,
+ reduce_dtype,
+ ],
+ outputs=output,
  )

- # Limitations and Comments section
+ gr.Markdown("# Details")
  with gr.Row():
- with gr.Column():
- gr.Markdown("# Limitations")
- gr.Markdown(LIMITATIONS)
- with gr.Column():
- gr.Markdown("# Comments and Details")
- gr.Markdown(DETAILS)
+ gr.Markdown(LIMITATIONS)
+ gr.Markdown(DETAILS)


  demo.launch()
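Aside: both versions keep the `NaturalNumber` helper, which uses `functools.partial` to pre-bind the integer constraints (minimum=1, step=1, precision=0, interactive=True) so each numeric input only needs a label and default. A minimal sketch of the pattern; the field names and default values below are hypothetical and not taken from the Space:

```python
from functools import partial

import gradio as gr

# Factory as in app.py: every call inherits the natural-number constraints.
NaturalNumber = partial(gr.Number, minimum=1, step=1, precision=0, interactive=True)

with gr.Blocks() as demo:
    # Hypothetical fields; the real ones are created in create_model_block() etc.
    layers = NaturalNumber(label="Layers", value=32)
    hidden = NaturalNumber(label="Hidden size", value=4096)

demo.launch()
```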
details.py CHANGED
@@ -1,21 +1,18 @@
  DETAILS = """
- ### Resources I found helpful while building this tool:
+ ### Motivation
+ Existing tools like the [Hugging Face Model Memory Estimator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage), [DeepSpeed Calculator](https://huggingface.co/spaces/andstor/deepspeed-model-memory-usage), and [DeepSpeed Native Utility](https://deepspeed.readthedocs.io/en/latest/memory.html) are valuable but don't support the full range of modern training configurations.
+
+ This tool adds:
+ - Arbitrary model configurations beyond preset architectures
+ - FSDP and 5D parallelism support
+ - Interactive memory breakdowns by category to inform configuration decisions
+
+ ### References
+ Helpful resources used while building this:
  - [The Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook)
  - [Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/abs/2205.05198)
  - [Transformer Math - Michael Wornow](https://michaelwornow.net/2024/01/18/counting-params-in-transformer)
  - [Transformer Math 101](https://blog.eleuther.ai/transformer-math/)
-
-
- ### Why this tool?
- While there are some good tools out there already:
- - [Hugging Face Model Memory Estimator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage)
- - [DeepSpeed Model Memory Calculator](https://huggingface.co/spaces/andstor/deepspeed-model-memory-usage)
- - [DeepSpeed Native Utility](https://deepspeed.readthedocs.io/en/latest/memory.html)
-
- None of them had all the features I wanted in one place. I wanted a tool that could:
- - Accept arbitrary model configurations
- - Support FSDP
- - Support 5d parallelism
- - Be interactive and break down memory usage by category, to better inform configurations.
-
  """
+
+ INSTRUCTIONS = """ """
limitations.py CHANGED
@@ -1,27 +1,15 @@
  LIMITATIONS = """
- This calculator has many limitations and assumptions
- ### Assumptions:
- - Your implementation of tensor parallel also incorporates sequence parallel
- - You are doing selective recomputation with flash attention if not doing gradient checkpointing
- - You keep a master copy of the model weights for mixed precision
- - May not be true for some implementations which cast on the fly
- - You're using Adam optimizer
- - If using PP you're using a schedule that will keep the number of activations roughly the same
- - EP is the number of PPxTP units that share each expert
- - Swiglu activation function
- - Rotary embeddings
+ ### Key Assumptions:
+ - Standard transformer architecture with homogeneous layers
+ - Adam optimizer with mixed precision training (master weights copy)
+ - Tensor parallelism includes sequence parallelism
+ - Pipeline parallelism maintains consistent activation memory

- ### Limitations:
- - Does not support non-homogenous layers
- - e.g. Llama4 Maverick with alternating dense and sparse layers, iRoPE
- - Does not include memory for kernel or framework overhead
- - Does not include memory for intermediates
- - Does not include vision layers for multi-modal models
- - Models shared experts as another routed expert per token
- - Does not support different dtypes for different parts of the model
- - e.g. MXFP4 for GPT-OSS 20 and 120B
- - Have not validated EP/FSDP interaction
- - Doesn't model biases on a per-model basis
+ ### Not Currently Supported:
+ - Non-standard architectures (alternating dense/sparse layers, custom attention)
+ - Multi-modal models with vision layers
+ - Mixed dtype training (e.g., MXFP4)
+ - Kernel/framework overhead and intermediate memory

- Note this is not an exhaustive list, just some of the main ones
- """
+ For advanced configurations, results should be validated against profiling.
+ """