rubenaghayan committed
Commit 75dbc58 · 1 Parent(s): f45427d

final edits
Files changed (3):
  1. app.py +47 -24
  2. details.py +12 -15
  3. limitations.py +12 -24
app.py CHANGED
@@ -1,3 +1,6 @@
+ import json
+ from pathlib import Path
+
  import gradio as gr
  import pandas as pd
  from functools import partial
@@ -6,17 +9,12 @@ from details import DETAILS
  from state import Model, Parallelism, Training
  from calculator import MemoryCalculation
  from dtypes import DType
- from gradio.themes import ThemeClass as Theme
+ from gradio.themes import Base
  from limitations import LIMITATIONS

  # Create a Number component for natural numbers (positive integers)
  NaturalNumber = partial(gr.Number, minimum=1, step=1, precision=0, interactive=True)

- colors = {
-
- }
-
-
  def create_parallelism_block():
  with gr.Column():
  gr.Markdown("# Parallelism")
@@ -235,12 +233,11 @@ def calculate(tp, pp, cp, ep, fsdp_enabled, fsdp_parallelism, fsdp_strategy, lay

  memory_data = pd.DataFrame(individual_data)

- # Define pastel color map
  color_map = {
- 'Parameter': '#B6E5D8', # Light Mint
- 'Gradient': '#FFB6C1', # Light Pink
- 'Optimizer': '#C7B3FF', # Light Purple
- 'Activation': '#FFD1A9', # Light Peach
+ 'Parameter': '#C2A9CE',
+ 'Gradient': '#AADB86',
+ 'Optimizer': '#F79090',
+ 'Activation': '#9EC7DD',
  }

  return gr.BarPlot(
@@ -279,11 +276,17 @@ css = """
  }
  """

- theme = Theme.from_hub("gstaff/xkcd")
- # otherwise invisible in light mode
- theme.checkbox_label_text_color=theme.block_label_text_color
- with gr.Blocks(theme=theme, css=css) as demo:
+ with gr.Blocks(theme='Base', css=css) as demo:
  with gr.Column():
+ gr.Markdown("# LLM Training Memory Visualizer")
+ gr.Markdown(
+ """
+ ## How to Use
+ 1. Use Presets OR Adjust the parallelism, model, and training panels to match your run.
+ 2. Press **Calculate** to refresh the memory breakdown chart.
+ 3. Review the details and references below for context on the estimates.
+ """
+ )
  with gr.Row(equal_height=True):
  tp, pp, cp, ep, fsdp_enabled, fsdp_parallelism, fsdp_strategy = create_parallelism_block()
  layers, vocab, hidden, intermediate, active_experts, total_experts, is_moe, presets, weight_tied_embeddings = create_model_block()
@@ -293,18 +296,38 @@ with gr.Blocks(theme=theme, css=css) as demo:

  calculate_button.click(
  fn=calculate,
- inputs=[tp, pp, cp, ep, fsdp_enabled, fsdp_parallelism, fsdp_strategy, layers, vocab, hidden, intermediate, active_experts, total_experts, is_moe, weight_tied_embeddings, seq_len, batch_size, gradient_checkpointing, grad_accumulation, precision, mixed_precision, param_dtype, reduce_dtype],
- outputs=output
+ inputs=[
+ tp,
+ pp,
+ cp,
+ ep,
+ fsdp_enabled,
+ fsdp_parallelism,
+ fsdp_strategy,
+ layers,
+ vocab,
+ hidden,
+ intermediate,
+ active_experts,
+ total_experts,
+ is_moe,
+ weight_tied_embeddings,
+ seq_len,
+ batch_size,
+ gradient_checkpointing,
+ grad_accumulation,
+ precision,
+ mixed_precision,
+ param_dtype,
+ reduce_dtype,
+ ],
+ outputs=output,
  )

- # Limitations and Comments section
+ gr.Markdown("# Details")
  with gr.Row():
- with gr.Column():
- gr.Markdown("# Limitations")
- gr.Markdown(LIMITATIONS)
- with gr.Column():
- gr.Markdown("# Comments and Details")
- gr.Markdown(DETAILS)
+ gr.Markdown(LIMITATIONS)
+ gr.Markdown(DETAILS)


  demo.launch()
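Aside: both versions keep the `NaturalNumber` helper, which uses `functools.partial` to pre-bind the integer constraints (minimum=1, step=1, precision=0, interactive=True) so each numeric input only needs a label and default. A minimal sketch of the pattern; the field names and default values below are hypothetical and not taken from the Space:

```python
from functools import partial

import gradio as gr

# Factory as in app.py: every call inherits the natural-number constraints.
NaturalNumber = partial(gr.Number, minimum=1, step=1, precision=0, interactive=True)

with gr.Blocks() as demo:
    # Hypothetical fields; the real ones are created in create_model_block() etc.
    layers = NaturalNumber(label="Layers", value=32)
    hidden = NaturalNumber(label="Hidden size", value=4096)

demo.launch()
```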
details.py CHANGED
@@ -1,21 +1,18 @@
  DETAILS = """
- ### Resources I found helpful while building this tool:
+ ### Motivation
+ Existing tools like the [Hugging Face Model Memory Estimator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage), [DeepSpeed Calculator](https://huggingface.co/spaces/andstor/deepspeed-model-memory-usage), and [DeepSpeed Native Utility](https://deepspeed.readthedocs.io/en/latest/memory.html) are valuable but don't support the full range of modern training configurations.
+
+ This tool adds:
+ - Arbitrary model configurations beyond preset architectures
+ - FSDP and 5D parallelism support
+ - Interactive memory breakdowns by category to inform configuration decisions
+
+ ### References
+ Helpful resources used while building this:
  - [The Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook)
  - [Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/abs/2205.05198)
  - [Transformer Math - Michael Wornow](https://michaelwornow.net/2024/01/18/counting-params-in-transformer)
  - [Transformer Math 101](https://blog.eleuther.ai/transformer-math/)
-
-
- ### Why this tool?
- While there are some good tools out there already:
- - [Hugging Face Model Memory Estimator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage)
- - [DeepSpeed Model Memory Calculator](https://huggingface.co/spaces/andstor/deepspeed-model-memory-usage)
- - [DeepSpeed Native Utility](https://deepspeed.readthedocs.io/en/latest/memory.html)
-
- None of them had all the features I wanted in one place. I wanted a tool that could:
- - Accept arbitrary model configurations
- - Support FSDP
- - Support 5d parallelism
- - Be interactive and break down memory usage by category, to better inform configurations.
-
  """
+
+ INSTRUCTIONS = """ """
limitations.py CHANGED
@@ -1,27 +1,15 @@
  LIMITATIONS = """
- This calculator has many limitations and assumptions
- ### Assumptions:
- - Your implementation of tensor parallel also incorporates sequence parallel
- - You are doing selective recomputation with flash attention if not doing gradient checkpointing
- - You keep a master copy of the model weights for mixed precision
- - May not be true for some implementations which cast on the fly
- - You're using Adam optimizer
- - If using PP you're using a schedule that will keep the number of activations roughly the same
- - EP is the number of PPxTP units that share each expert
- - Swiglu activation function
- - Rotary embeddings
+ ### Key Assumptions:
+ - Standard transformer architecture with homogeneous layers
+ - Adam optimizer with mixed precision training (master weights copy)
+ - Tensor parallelism includes sequence parallelism
+ - Pipeline parallelism maintains consistent activation memory

- ### Limitations:
- - Does not support non-homogenous layers
- - e.g. Llama4 Maverick with alternating dense and sparse layers, iRoPE
- - Does not include memory for kernel or framework overhead
- - Does not include memory for intermediates
- - Does not include vision layers for multi-modal models
- - Models shared experts as another routed expert per token
- - Does not support different dtypes for different parts of the model
- - e.g. MXFP4 for GPT-OSS 20 and 120B
- - Have not validated EP/FSDP interaction
- - Doesn't model biases on a per-model basis
+ ### Not Currently Supported:
+ - Non-standard architectures (alternating dense/sparse layers, custom attention)
+ - Multi-modal models with vision layers
+ - Mixed dtype training (e.g., MXFP4)
+ - Kernel/framework overhead and intermediate memory

- Note this is not an exhaustive list, just some of the main ones
- """
+ For advanced configurations, results should be validated against profiling.
+ """