from state import Model, Parallelism, Training
from dtypes import DType


class MemoryCalculation:
    def __init__(self, modelconfig: Model, parallelismconfig: Parallelism, trainingconfig: Training):
        self.model = modelconfig
        self.parallelism = parallelismconfig
        self.training = trainingconfig

    def calculate_num_parameters(self) -> float:
        # https://michaelwornow.net/2024/01/18/counting-params-in-transformer
        # https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=memory_usage_in_transformers
        # Biases are not toggled per model; for simplicity they are counted wherever they
        # could appear. They are small relative to the weights, so this gives an upper bound.
        h, i, l, v, e = (
            self.model.hidden_dim,
            self.model.intermediate_size,
            self.model.num_layers,
            self.model.vocab_size,
            self.model.total_experts,
        )
        tp, pp, ep = (
            self.parallelism.tensor_parallelism,
            self.parallelism.pipeline_parallelism,
            self.parallelism.expert_parallelism,
        )
        # Embedding layers
        input_embedding = v * h / tp
        unembedding = 0
        if not self.model.weight_tied_embeddings:
            unembedding = h * v / tp
        # Attention
        layer_norm_attn_in = 2 * h  # weight and bias, not TP-sharded
        qkv = (3 * h * h + 3 * h) / tp  # column-parallel: weights and biases sharded
        attn_output_proj = h * h / tp + h  # row-parallel: weight sharded, bias replicated
        attn = layer_norm_attn_in + qkv + attn_output_proj
        # MLP
        layer_norm_mlp_in = 2 * h  # weight and bias, not TP-sharded
        router = h * e + e  # assumed replicated for simplicity
        mlp_up_proj = (h * i + i) / tp  # column-parallel: weight and bias sharded
        mlp_gate_proj = (h * i + i) / tp  # column-parallel: weight and bias sharded
        mlp_down_proj = i * h / tp + h  # row-parallel: weight sharded, bias replicated
        expert = mlp_up_proj + mlp_gate_proj + mlp_down_proj
        experts = expert * e / ep
        mlp = layer_norm_mlp_in + router + experts
        layer = attn + mlp
        layers = layer * l
        final_layer_norm = 2 * h  # weight and bias, not TP-sharded
        # PP and weight tying make it hard to know which stage holds the embedding layer.
        # Assume the "worst" case: it sits on the last stage alongside the final layer norm
        # (which is small by comparison anyway).
        if pp == 1:
            total_params = input_embedding + layers + unembedding + final_layer_norm
        else:
            total_params = max(input_embedding, unembedding) + layers / pp + final_layer_norm
        return total_params

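    # Sanity check for calculate_num_parameters above (a rough, hedged approximation that
    # ignores biases, norms, and the router): with tp = pp = ep = 1, untied embeddings,
    # and a single dense expert, the count is dominated by
    #     total ≈ 2 * v * h + l * (4 * h * h + 3 * h * i)
    # e.g. h = 4096, i = 11008, l = 32, v = 32000 gives ≈ 6.7e9 parameters,
    # in line with a LLaMA-7B-sized dense model.
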
    def calculate_activation_parameters(self) -> float:
        # https://blog.eleuther.ai/transformer-math/#activations-and-batch-size
        # https://arxiv.org/abs/2205.05198
        # PP is not considered, since most PP schedules run multiple concurrent
        # microbatches to reduce the bubble.
        b, s = self.training.batch_size, self.training.sequence_length
        h, i, l, v, e, ae = (
            self.model.hidden_dim,
            self.model.intermediate_size,
            self.model.num_layers,
            self.model.vocab_size,
            self.model.total_experts,
            self.model.active_experts,
        )
        tp, cp, pp, ep = (
            self.parallelism.tensor_parallelism,
            self.parallelism.context_parallelism,
            self.parallelism.pipeline_parallelism,
            self.parallelism.expert_parallelism,
        )
        sp = tp  # sequence parallelism uses the same degree as tensor parallelism
        if self.training.gradient_checkpointing:
            # Full recomputation: only the input to each layer is kept.
            embed = 0
            layer = s * b * h / cp / tp
            layers = layer * l
            final_layer_out = s * b * h / cp / sp
            final_norm = s * b * h / cp / sp
            unembed = s * b * v / cp / tp
            logits = s * b * v / cp / sp  # TODO: revisit the sharding of the logits
            num_activations = (
                embed + layers + final_layer_out + final_norm + unembed + logits
            )
            return num_activations
        else:
            # Assume flash attention, i.e. selective recomputation, and tensor + sequence
            # parallelism as described in https://arxiv.org/abs/2205.05198.
            # Each variable below is the stored output of one operation.
            # Attention block
            layer_in = s * b * h / cp / tp
            attn_norm = s * b * h / cp / sp
            flash = s * b * h / cp / tp  # everything else is recomputed by flash attention
            projection = s * b * h / cp / tp
            attn = layer_in + attn_norm + flash + projection
            # MLP block
            mlp_norm = s * b * h / cp / sp
            mlp_up = s * b * i / cp / tp
            mlp_gate = s * b * i / cp / tp
            hadamard_swiglu = s * b * i / cp / tp
            mlp_down = s * b * h / cp / tp
            if self.model.is_moe:
                # SP-shard the router output since the norm output feeding it is SP-sharded.
                router = s * b * e / cp / sp
                expert = mlp_up + mlp_gate + hadamard_swiglu + mlp_down
                experts = expert * ae
                mlp = mlp_norm + router + experts
            else:
                mlp = mlp_norm + mlp_up + mlp_gate + hadamard_swiglu + mlp_down
            layer = attn + mlp
            layers = layer * l  # no reduction from PP: schedules add microbatches instead
            # Other
            embed = 0
            final_layer_out = s * b * h / cp / sp  # both sequence and context parallelism
            final_norm = s * b * h / cp / sp
            unembed = s * b * v / cp / tp
            logits = s * b * v / cp / sp
            num_activations = (
                embed + layers + final_layer_out + final_norm + unembed + logits
            )
            return num_activations

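    # Note on calculate_activation_parameters above (a hedged back-of-the-envelope form):
    # for a dense layer with flash attention (the else branch) and sp == tp, the per-layer
    # element count reduces to roughly
    #     layer ≈ (6 * s * b * h + 3 * s * b * i) / (cp * tp)
    # These are element counts, not bytes; the dtype is applied in calculate_activation_memory.
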
    def calculate_parameter_memory(self) -> float:
        num_parameters = self.calculate_num_parameters()
        if self.training.mixed_precision:
            # Mixed precision keeps a full-precision master copy of the weights plus a
            # lower-precision working copy used for the forward/backward pass.
            master_copy = num_parameters * self.training.precision
            working_copy = num_parameters * self.training.param_dtype
            return master_copy + working_copy
        else:
            return num_parameters * self.training.precision

    def calculate_gradient_memory(self) -> float:
        # https://blog.eleuther.ai/transformer-math/#gradients
        # One gradient per parameter, assumed to be kept in FP32 (4 bytes each).
        return self.calculate_num_parameters() * DType.FP32

    def calculate_optimizer_memory(self) -> float:
        # https://blog.eleuther.ai/transformer-math/#optimizer-states
        # https://www.determined.ai/blog/act-mem-2, https://web.archive.org/web/20250308172134/https://www.determined.ai/blog/act-mem-2
        # Adam keeps two states (momentum and variance) per parameter, assumed to always be FP32.
        return 2 * self.calculate_num_parameters() * DType.FP32

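    # Rough sanity check for calculate_optimizer_memory above (hedged, assuming the
    # optimizer states are not sharded): a ~7e9-parameter model needs about
    #     2 * 7e9 * 4 bytes ≈ 56 GB
    # for Adam states alone, often the largest of the static (non-activation) contributions.
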
    def calculate_activation_memory(self) -> float:
        # Activations are stored in the working dtype under mixed precision.
        if self.training.mixed_precision:
            return self.calculate_activation_parameters() * self.training.param_dtype
        else:
            return self.calculate_activation_parameters() * self.training.precision
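

# A hedged usage sketch, not part of the original module: it only relies on the methods
# defined above and on Model / Parallelism / Training objects built elsewhere (state.py);
# how those config objects are constructed is left as an assumption.
def report_memory_breakdown(model: Model, parallelism: Parallelism, training: Training) -> dict:
    calc = MemoryCalculation(model, parallelism, training)
    breakdown = {
        "parameters": calc.calculate_parameter_memory(),
        "gradients": calc.calculate_gradient_memory(),
        "optimizer_states": calc.calculate_optimizer_memory(),
        "activations": calc.calculate_activation_memory(),
    }
    breakdown["total"] = sum(breakdown.values())
    # Values are presumably bytes, given the 4-byte FP32 factors used above.
    return breakdown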