from state import Model

# https://huggingface.co/google/gemma-3-270m/blob/main/config.json
GEMMA3_270M = Model(
    vocab_size=262144,
    num_layers=18,
    hidden_dim=640,
    intermediate_size=2048,
    weight_tied_embeddings=True,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

GEMMA3_1B = Model(
    vocab_size=262144,
    num_layers=26,
    hidden_dim=1152,
    intermediate_size=6912,
    weight_tied_embeddings=True,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

GEMMA3_4B = Model(
    vocab_size=262144,
    num_layers=34,
    hidden_dim=2560,
    intermediate_size=10240,
    weight_tied_embeddings=True,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

GEMMA3_12B = Model(
    vocab_size=262144,
    num_layers=48,
    hidden_dim=3840,
    intermediate_size=15360,
    weight_tied_embeddings=True,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

GEMMA3_27B = Model(
    vocab_size=262144,
    num_layers=62,
    hidden_dim=5376,
    intermediate_size=21504,
    weight_tied_embeddings=True,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

# No Maverick: we don't support non-homogeneous layers yet
# (Maverick interleaves dense and MoE layers).
# https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/config.json
LLAMA4_SCOUT = Model(
    vocab_size=202048,
    num_layers=48,
    hidden_dim=5120,
    intermediate_size=8192,  # per-expert FFN width (the dense MLP width differs)
    weight_tied_embeddings=False,
    active_experts=2,  # 1 routed + 1 shared expert per token
    total_experts=17,  # 16 routed + 1 shared expert
    is_moe=True,
)

# https://huggingface.co/unsloth/Llama-3.2-1B-Instruct/blob/main/config.json
LLAMA3_1B = Model(
    vocab_size=128256,
    num_layers=16,
    hidden_dim=2048,
    intermediate_size=8192,
    weight_tied_embeddings=True,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

# https://huggingface.co/unsloth/Llama-3.2-3B-Instruct/blob/main/config.json
LLAMA3_3B = Model(
    vocab_size=128256,
    num_layers=28,
    hidden_dim=3072,
    intermediate_size=8192,
    weight_tied_embeddings=True,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

# https://huggingface.co/unsloth/llama-3-8b-Instruct/blob/main/config.json
# Unlike the 3.2 1B/3B models, Llama 3 8B does not tie embeddings.
LLAMA3_8B = Model(
    vocab_size=128256,
    num_layers=32,
    hidden_dim=4096,
    intermediate_size=14336,
    weight_tied_embeddings=False,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

# https://huggingface.co/unsloth/Llama-3.3-70B-Instruct/blob/main/config.json
LLAMA3_70B = Model(
    vocab_size=128256,
    num_layers=80,
    hidden_dim=8192,
    intermediate_size=28672,
    weight_tied_embeddings=False,
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

DEFAULTS = {
    "Gemma3 270M": GEMMA3_270M,
    "Gemma3 1B": GEMMA3_1B,
    "Gemma3 4B": GEMMA3_4B,
    "Gemma3 12B": GEMMA3_12B,
    "Gemma3 27B": GEMMA3_27B,
    "Llama3 1B": LLAMA3_1B,
    "Llama3 3B": LLAMA3_3B,
    "Llama3 8B": LLAMA3_8B,
    "Llama3 70B": LLAMA3_70B,
    "Llama4 Scout": LLAMA4_SCOUT,
}
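
# Example usage (a sketch, not part of the upstream configs): a rough
# parameter-count estimate built from the fields above. It assumes a
# SwiGLU-style MLP (gate/up/down projections, i.e. 3 * hidden_dim *
# intermediate_size weights per expert) and ignores attention and norm
# weights, which Model does not describe, so treat the result as a
# lower bound on the true count.
def approx_param_count(model: Model) -> int:
    embed = model.vocab_size * model.hidden_dim
    if not model.weight_tied_embeddings:
        embed *= 2  # separate input embedding and LM head
    # All experts count toward total (stored) parameters, active or not.
    mlp = model.num_layers * model.total_experts * 3 * model.hidden_dim * model.intermediate_size
    return embed + mlp


if __name__ == "__main__":
    for name, model in DEFAULTS.items():
        print(f"{name}: ~{approx_param_count(model) / 1e9:.2f}B params (embeddings + MLP only)")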