# NOTE(review): the following metadata is HF Space file-viewer chrome that was
# pasted into the source; kept here as a comment so the file stays valid Python.
# author: rubenaghayan — commit: "added support for precision" (97e312a), 6.43 kB
import gradio as gr
import pandas as pd
from functools import partial
from defaults import DEFAULTS
from state import Model, Parallelism, Training
from calculator import MemoryCalculation
from dtypes import DType
# Factory for a gr.Number constrained to natural numbers (integers >= 1).
NaturalNumber = partial(
    gr.Number,
    minimum=1,
    step=1,
    precision=0,
    interactive=True,
)
def greet(name, intensity) -> str:
    """Return a greeting for *name* followed by int(intensity) exclamation marks."""
    excitement = "!" * int(intensity)
    return f"Hello, {name}{excitement}"
def create_parallelism_block():
    """Render the parallelism settings column.

    Returns the four degree-of-parallelism inputs as a
    (tensor, pipeline, context, expert) tuple, each defaulting to 1.
    """
    labels = (
        "Tensor Parallelism",
        "Pipeline Parallelism",
        "Context Parallelism",
        "Expert Parallelism",
    )
    with gr.Column():
        gr.Markdown("# Parallelism")
        with gr.Group():
            widgets = [NaturalNumber(label=text, value=1) for text in labels]
    return tuple(widgets)
def create_model_block():
    """Render the model-architecture settings column.

    Returns:
        Tuple of (layers, vocab, hidden, intermediate, active_experts,
        total_experts, is_moe, presets). ``presets`` is ``None`` until the
        preset dropdown ships.
    """
    with gr.Column():
        gr.Markdown("# Model Architecture")
        layers = NaturalNumber(label="Number of Layers", value=32)
        vocab = NaturalNumber(label="Vocab Size", value=32000)
        hidden = NaturalNumber(label="Hidden Dim", value=4096)
        intermediate = NaturalNumber(label="Intermediate Dim", value=11008)
        is_moe = gr.Checkbox(label="Mixture of Experts (MoE)", value=False)
        active_experts = NaturalNumber(label="Active Experts", value=2, visible=False)
        total_experts = NaturalNumber(label="Total Experts", value=8, visible=False)
        # Show the expert-count fields only when the MoE checkbox is ticked.
        is_moe.change(
            fn=lambda x: [gr.update(visible=x), gr.update(visible=x)],
            inputs=is_moe,
            outputs=[active_experts, total_experts]
        )
        # Preset dropdown is not ready yet. Define a None placeholder so the
        # return statement below does not raise NameError and callers can keep
        # unpacking an 8-tuple.
        # presets = gr.Dropdown(list(DEFAULTS.keys()), label="Presets", interactive=True)
        presets = None
    return layers, vocab, hidden, intermediate, active_experts, total_experts, is_moe, presets
def create_training_block():
    """Render the training-configuration column.

    Returns:
        Tuple of (seq_len, batch_size, gradient_checkpointing,
        grad_accumulation, precision, mixed_precision, param_dtype,
        reduce_dtype) Gradio components.
    """
    with gr.Column():
        gr.Markdown("# Training Config")
        seq_len = NaturalNumber(label="Sequence Length", value=8192)
        batch_size = NaturalNumber(
            label="Batch Size",
            info="If you are using gradient accumulation, enter microbatch size",
            value=8,
        )
        with gr.Row():
            gradient_checkpointing = gr.Checkbox(label="Gradient Checkpointing", value=False)
            grad_accumulation = gr.Checkbox(label="Gradient Accumulation", value=False)
        precision = gr.Dropdown(
            DType.values(), label="Precision", value=DType.FP32.value, interactive=True
        )
        mixed_precision = gr.Checkbox(label="Mixed Precision", value=False)
        # Hidden unless mixed precision is enabled (see .change handler below).
        param_dtype = gr.Dropdown(
            DType.values(), label="Parameter Dtype", value=DType.FP32.value,
            interactive=True, visible=False,
        )
        reduce_dtype = gr.Dropdown(
            DType.values(), label="Reduce Dtype", value=DType.FP32.value,
            interactive=True, visible=False,
        )
        # Reveal both dtype dropdowns only while the checkbox is ticked.
        mixed_precision.change(
            fn=lambda x: [gr.update(visible=x), gr.update(visible=x)],
            inputs=mixed_precision,
            outputs=[param_dtype, reduce_dtype]
        )
    return (
        seq_len,
        batch_size,
        gradient_checkpointing,
        grad_accumulation,
        precision,
        mixed_precision,
        param_dtype,
        reduce_dtype,
    )
def calculate(tp, pp, cp, ep, layers, vocab, hidden, intermediate, active_experts, total_experts, is_moe, seq_len, batch_size, gradient_checkpointing, grad_accumulation, precision, mixed_precision, param_dtype, reduce_dtype):
    """Compute the memory breakdown for the given config and return a BarPlot.

    Builds Model/Parallelism/Training state objects from the raw UI values,
    runs MemoryCalculation, and charts the four components in GB.
    """
    # Assemble typed config objects from the raw Gradio inputs.
    model_config = Model(
        vocab_size=int(vocab),
        num_layers=int(layers),
        hidden_dim=int(hidden),
        intermediate_size=int(intermediate),
        weight_tied_embeddings=True,  # Default assumption
        active_experts=int(active_experts),
        total_experts=int(total_experts),
        is_moe=is_moe,
    )
    parallelism_config = Parallelism(
        tensor_parallelism=int(tp),
        pipeline_parallelism=int(pp),
        context_parallelism=int(cp),
        expert_parallelism=int(ep),
    )
    training_config = Training(
        sequence_length=int(seq_len),
        batch_size=int(batch_size),
        gradient_checkpointing=gradient_checkpointing,
        grad_accumulation=grad_accumulation,
        precision=DType(precision),
        mixed_precision=mixed_precision,
        param_dtype=DType(param_dtype),
        reduce_dtype=DType(reduce_dtype),
    )

    calc = MemoryCalculation(model_config, parallelism_config, training_config)

    # Gather each memory component (bytes) and convert to GB for plotting.
    components = {
        "Parameter Memory": calc.calculate_parameter_memory(),
        "Activation Memory": calc.calculate_activation_memory(),
        "Gradient Memory": calc.calculate_gradient_memory(),
        "Optimizer Memory": calc.calculate_optimizer_memory(),
    }
    memory_data = pd.DataFrame({
        "Component": list(components.keys()),
        "Memory (GB)": [num_bytes / 1e9 for num_bytes in components.values()],
    })

    return gr.BarPlot(
        value=memory_data,
        x="Component",
        y="Memory (GB)",
        title="LLM Memory Usage Breakdown",
        container=False,
        y_lim=[0, None],
    )
# App layout: sidebar title, three config columns side by side, then the
# calculate button wired to the bar-plot output.
with gr.Blocks(theme='gstaff/xkcd') as demo:
    with gr.Sidebar():
        # Use Markdown so the "##" heading renders as a title instead of
        # appearing as literal text inside an editable Textbox.
        gr.Markdown("## LLM Memory Visualizer")
    with gr.Column():
        with gr.Row(equal_height=True):
            tp, pp, cp, ep = create_parallelism_block()
            layers, vocab, hidden, intermediate, active_experts, total_experts, is_moe, presets = create_model_block()
            seq_len, batch_size, gradient_checkpointing, grad_accumulation, precision, mixed_precision, param_dtype, reduce_dtype = create_training_block()
        calculate_button = gr.Button("Calculate")
        output = gr.BarPlot(label="Memory Usage Breakdown")
        calculate_button.click(
            fn=calculate,
            inputs=[tp, pp, cp, ep, layers, vocab, hidden, intermediate, active_experts, total_experts, is_moe, seq_len, batch_size, gradient_checkpointing, grad_accumulation, precision, mixed_precision, param_dtype, reduce_dtype],
            outputs=output
        )
demo.launch()