# NOTE(review): the following metadata is HF Space file-viewer chrome that was
# pasted into the source; kept here as a comment so the file stays valid Python.
# author: rubenaghayan — commit: "added support for precision" (97e312a), 6.43 kB
import gradio as gr
import pandas as pd
from functools import partial
from defaults import DEFAULTS
from state import Model, Parallelism, Training
from calculator import MemoryCalculation
from dtypes import DType
# Factory for a gr.Number constrained to natural numbers (integers >= 1).
NaturalNumber = partial(
    gr.Number,
    minimum=1,
    step=1,
    precision=0,
    interactive=True,
)
def greet(name, intensity) -> str:
    """Return a greeting for *name* followed by int(intensity) exclamation marks."""
    excitement = "!" * int(intensity)
    return f"Hello, {name}{excitement}"
def create_parallelism_block():
    """Render the parallelism settings column.

    Returns the four degree-of-parallelism inputs as a
    (tensor, pipeline, context, expert) tuple, each defaulting to 1.
    """
    labels = (
        "Tensor Parallelism",
        "Pipeline Parallelism",
        "Context Parallelism",
        "Expert Parallelism",
    )
    with gr.Column():
        gr.Markdown("# Parallelism")
        with gr.Group():
            widgets = [NaturalNumber(label=text, value=1) for text in labels]
    return tuple(widgets)
def create_model_block():
    """Render the model-architecture settings column.

    Returns:
        Tuple of (layers, vocab, hidden, intermediate, active_experts,
        total_experts, is_moe, presets). ``presets`` is ``None`` until the
        preset dropdown ships.
    """
    with gr.Column():
        gr.Markdown("# Model Architecture")
        layers = NaturalNumber(label="Number of Layers", value=32)
        vocab = NaturalNumber(label="Vocab Size", value=32000)
        hidden = NaturalNumber(label="Hidden Dim", value=4096)
        intermediate = NaturalNumber(label="Intermediate Dim", value=11008)
        is_moe = gr.Checkbox(label="Mixture of Experts (MoE)", value=False)
        active_experts = NaturalNumber(label="Active Experts", value=2, visible=False)
        total_experts = NaturalNumber(label="Total Experts", value=8, visible=False)
        # Show the expert-count fields only when the MoE checkbox is ticked.
        is_moe.change(
            fn=lambda x: [gr.update(visible=x), gr.update(visible=x)],
            inputs=is_moe,
            outputs=[active_experts, total_experts]
        )
        # Preset dropdown is not ready yet. Define a None placeholder so the
        # return statement below does not raise NameError and callers can keep
        # unpacking an 8-tuple.
        # presets = gr.Dropdown(list(DEFAULTS.keys()), label="Presets", interactive=True)
        presets = None
    return layers, vocab, hidden, intermediate, active_experts, total_experts, is_moe, presets
def create_training_block():
    """Render the training-configuration column.

    Returns:
        Tuple of (seq_len, batch_size, gradient_checkpointing,
        grad_accumulation, precision, mixed_precision, param_dtype,
        reduce_dtype) Gradio components.
    """
    with gr.Column():
        gr.Markdown("# Training Config")
        seq_len = NaturalNumber(label="Sequence Length", value=8192)
        batch_size = NaturalNumber(
            label="Batch Size",
            info="If you are using gradient accumulation, enter microbatch size",
            value=8,
        )
        with gr.Row():
            gradient_checkpointing = gr.Checkbox(label="Gradient Checkpointing", value=False)
            grad_accumulation = gr.Checkbox(label="Gradient Accumulation", value=False)
        precision = gr.Dropdown(
            DType.values(), label="Precision", value=DType.FP32.value, interactive=True
        )
        mixed_precision = gr.Checkbox(label="Mixed Precision", value=False)
        # Hidden unless mixed precision is enabled (see .change handler below).
        param_dtype = gr.Dropdown(
            DType.values(), label="Parameter Dtype", value=DType.FP32.value,
            interactive=True, visible=False,
        )
        reduce_dtype = gr.Dropdown(
            DType.values(), label="Reduce Dtype", value=DType.FP32.value,
            interactive=True, visible=False,
        )
        # Reveal both dtype dropdowns only while the checkbox is ticked.
        mixed_precision.change(
            fn=lambda x: [gr.update(visible=x), gr.update(visible=x)],
            inputs=mixed_precision,
            outputs=[param_dtype, reduce_dtype]
        )
    return (
        seq_len,
        batch_size,
        gradient_checkpointing,
        grad_accumulation,
        precision,
        mixed_precision,
        param_dtype,
        reduce_dtype,
    )
def calculate(tp, pp, cp, ep, layers, vocab, hidden, intermediate, active_experts, total_experts, is_moe, seq_len, batch_size, gradient_checkpointing, grad_accumulation, precision, mixed_precision, param_dtype, reduce_dtype):
    """Compute the memory breakdown for the given config and return a BarPlot.

    Builds Model/Parallelism/Training state objects from the raw UI values,
    runs MemoryCalculation, and charts the four components in GB.
    """
    # Assemble typed config objects from the raw Gradio inputs.
    model_config = Model(
        vocab_size=int(vocab),
        num_layers=int(layers),
        hidden_dim=int(hidden),
        intermediate_size=int(intermediate),
        weight_tied_embeddings=True,  # Default assumption
        active_experts=int(active_experts),
        total_experts=int(total_experts),
        is_moe=is_moe,
    )
    parallelism_config = Parallelism(
        tensor_parallelism=int(tp),
        pipeline_parallelism=int(pp),
        context_parallelism=int(cp),
        expert_parallelism=int(ep),
    )
    training_config = Training(
        sequence_length=int(seq_len),
        batch_size=int(batch_size),
        gradient_checkpointing=gradient_checkpointing,
        grad_accumulation=grad_accumulation,
        precision=DType(precision),
        mixed_precision=mixed_precision,
        param_dtype=DType(param_dtype),
        reduce_dtype=DType(reduce_dtype),
    )

    calc = MemoryCalculation(model_config, parallelism_config, training_config)

    # Gather each memory component (bytes) and convert to GB for plotting.
    components = {
        "Parameter Memory": calc.calculate_parameter_memory(),
        "Activation Memory": calc.calculate_activation_memory(),
        "Gradient Memory": calc.calculate_gradient_memory(),
        "Optimizer Memory": calc.calculate_optimizer_memory(),
    }
    memory_data = pd.DataFrame({
        "Component": list(components.keys()),
        "Memory (GB)": [num_bytes / 1e9 for num_bytes in components.values()],
    })

    return gr.BarPlot(
        value=memory_data,
        x="Component",
        y="Memory (GB)",
        title="LLM Memory Usage Breakdown",
        container=False,
        y_lim=[0, None],
    )
# App layout: sidebar title, three config columns side by side, then the
# calculate button wired to the bar-plot output.
with gr.Blocks(theme='gstaff/xkcd') as demo:
    with gr.Sidebar():
        # Use Markdown so the "##" heading renders as a title instead of
        # appearing as literal text inside an editable Textbox.
        gr.Markdown("## LLM Memory Visualizer")
    with gr.Column():
        with gr.Row(equal_height=True):
            tp, pp, cp, ep = create_parallelism_block()
            layers, vocab, hidden, intermediate, active_experts, total_experts, is_moe, presets = create_model_block()
            seq_len, batch_size, gradient_checkpointing, grad_accumulation, precision, mixed_precision, param_dtype, reduce_dtype = create_training_block()
        calculate_button = gr.Button("Calculate")
        output = gr.BarPlot(label="Memory Usage Breakdown")
        calculate_button.click(
            fn=calculate,
            inputs=[tp, pp, cp, ep, layers, vocab, hidden, intermediate, active_experts, total_experts, is_moe, seq_len, batch_size, gradient_checkpointing, grad_accumulation, precision, mixed_precision, param_dtype, reduce_dtype],
            outputs=output
        )
demo.launch()