{ "vocab_size": 50271, "n_layer": 24, "n_head": 12, "n_embd": 768, "max_position_embeddings": 2048, "mlp_mult": 4, "num_funcs": 3, "router_dim": 128, "use_channel_attention": false, "dropout": 0.0, "layer_norm_epsilon": 1e-05, "router_aux_weight": 0.005, "use_flash_attn": true, "router_tau": 1.6, "rope_theta": 10000.0, "gradient_checkpointing": true, "_comment": "Ultra-deep reasoning config: 24L/12H/768d, mlp_mult=4.5 (~390M params). Maximum reasoning depth for RAG + multi-step inference. ~14-16GB VRAM." }