from state import Model
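
# `Model` is defined in state.py. For readers of this file alone, a minimal sketch of the
# fields assumed here (field names are taken from the constructor calls below; the
# dataclass form and types are assumptions, not the actual state.py definition):
#
#   @dataclass
#   class Model:
#       vocab_size: int
#       num_layers: int
#       hidden_dim: int
#       intermediate_size: int
#       weight_tied_embeddings: bool
#       active_experts: int
#       total_experts: int
#       is_moe: bool
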
# https://huggingface.co/google/gemma-3-270m/blob/main/config.json
GEMMA3_270M = Model(
    vocab_size=262144,
    num_layers=18,
    hidden_dim=640,
    intermediate_size=2048,
    weight_tied_embeddings=True,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

GEMMA3_1B = Model(
    vocab_size=262144,
    num_layers=26,
    hidden_dim=1152,
    intermediate_size=6912,
    weight_tied_embeddings=True,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

GEMMA3_4B = Model(
    vocab_size=262144,
    num_layers=34,
    hidden_dim=2560,
    intermediate_size=10240,
    weight_tied_embeddings=True,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

GEMMA3_12B = Model(
    vocab_size=262144,
    num_layers=48,
    hidden_dim=3840,
    intermediate_size=15360,
    weight_tied_embeddings=True,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

GEMMA3_27B = Model(
    vocab_size=262144,
    num_layers=62,
    hidden_dim=5376,
    intermediate_size=21504,
    weight_tied_embeddings=True,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

# No Maverick; non-homogeneous layers aren't supported yet
# https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/config.json
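# Expert counts include the always-on shared expert: 16 routed + 1 shared per MoE layer,
# and with top-1 routing, 2 experts (1 routed + 1 shared) run per token.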
LLAMA4_SCOUT = Model(
    vocab_size=202048,
    num_layers=48,
    hidden_dim=5120,
    intermediate_size=8192,
    weight_tied_embeddings=True,
    active_experts=2,
    total_experts=17,
    is_moe=True,
)

# https://huggingface.co/unsloth/Llama-3.2-1B-Instruct/blob/main/config.json
LLAMA3_1B = Model(
    vocab_size=128256,
    num_layers=16,
    hidden_dim=2048,
    intermediate_size=8192,
    weight_tied_embeddings=True,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

# https://huggingface.co/unsloth/Llama-3.2-3B-Instruct/blob/main/config.json
LLAMA3_3B = Model(
    vocab_size=128256,
    num_layers=28,
    hidden_dim=3072,
    intermediate_size=8192,
    weight_tied_embeddings=True,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

# https://huggingface.co/unsloth/llama-3-8b-Instruct/blob/main/config.json
LLAMA3_8B = Model(
    vocab_size=128256,
    num_layers=32,
    hidden_dim=4096,
    intermediate_size=14336,
    weight_tied_embeddings=False,  # Llama 3 8B has a separate lm_head (not tied)
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

# https://huggingface.co/unsloth/Llama-3.3-70B-Instruct/blob/main/config.json
LLAMA3_70B = Model(
    vocab_size=128256,
    num_layers=80,
    hidden_dim=8192,
    intermediate_size=28672,
    weight_tied_embeddings=False,  # Llama 3.3 70B has a separate lm_head (not tied)
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

DEFAULTS = {
    "Gemma3 270M": GEMMA3_270M,
    "Gemma3 1B": GEMMA3_1B,
    "Gemma3 4B": GEMMA3_4B,
    "Gemma3 12B": GEMMA3_12B,
    "Gemma3 27B": GEMMA3_27B,
    "Llama3 1B": LLAMA3_1B,
    "Llama3 3B": LLAMA3_3B,
    "Llama3 8B": LLAMA3_8B,
    "Llama3 70B": LLAMA3_70B,
    "Llama4 Scout": LLAMA4_SCOUT,
}
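

# Usage sketch (illustrative only): presets are looked up by display name in DEFAULTS.
# The rough parameter estimate below (tied vs. untied embeddings plus SwiGLU-style MLPs,
# attention weights omitted) is an assumption for demonstration, not the visualizer's
# own accounting.
if __name__ == "__main__":
    model = DEFAULTS["Llama3 8B"]
    # Tied embeddings reuse one vocab x hidden matrix for both input embedding and LM head.
    embed_copies = 1 if model.weight_tied_embeddings else 2
    embedding_params = embed_copies * model.vocab_size * model.hidden_dim
    # Gated MLP: gate_proj + up_proj + down_proj, replicated per expert for MoE models.
    experts = model.total_experts if model.is_moe else 1
    mlp_params = model.num_layers * experts * 3 * model.hidden_dim * model.intermediate_size
    total_b = (embedding_params + mlp_params) / 1e9
    print(f"Llama3 8B ~= {total_b:.2f}B params (embeddings + MLPs only)")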