Commit fbace78
Parent(s): 036d2ca

modify code to use tt moe

Files changed:
- config.json (+5, -0)
- configuration_qwen3_moe.py (+3, -3)
- modeling_qwen3_moe.py (+15, -1)
config.json CHANGED
@@ -2,6 +2,11 @@
   "architectures": [
     "Qwen3MoeForCausalLM"
   ],
+  "auto_map": {
+    "AutoConfig": "configuration_qwen3_moe.Qwen3MoeConfig",
+    "AutoModelForCausalLM": "modeling_qwen3_moe.Qwen3MoeForCausalLM",
+    "AutoModel": "modeling_qwen3_moe.Qwen3MoeModel"
+  },
   "attention_bias": false,
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
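The new auto_map entries point the Auto classes at the custom configuration and modeling files bundled with the checkpoint, so the model loads as remote code. A minimal loading sketch (the repo id below is a placeholder, and trust_remote_code=True is required for auto_map to be honored):

from transformers import AutoConfig, AutoModelForCausalLM

# Placeholder repo id; substitute the Hub repo that contains this commit.
repo_id = "your-org/qwen3-moe-tt"

# trust_remote_code=True lets the Auto classes follow the auto_map entries and
# import the bundled configuration_qwen3_moe.py / modeling_qwen3_moe.py.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)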
configuration_qwen3_moe.py CHANGED
@@ -14,9 +14,9 @@
 # limitations under the License.
 """Qwen3MoE model configuration"""
 
-from ...configuration_utils import PretrainedConfig
-from ...modeling_rope_utils import rope_config_validation
-from ...utils import logging
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.utils import logging
 
 
 logger = logging.get_logger(__name__)
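The imports switch from package-relative paths (from ...utils import logging) to absolute transformers.* paths, presumably because a file fetched through auto_map / trust_remote_code is imported as a standalone module outside the transformers package, where relative imports would not resolve. A rough check, assuming a local copy of the edited file sits on the import path:

# Import the edited file as a standalone module (no transformers package context).
from configuration_qwen3_moe import Qwen3MoeConfig

cfg = Qwen3MoeConfig()  # library defaults
print(cfg.num_experts, cfg.num_experts_per_tok, cfg.moe_intermediate_size)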
modeling_qwen3_moe.py CHANGED
@@ -39,6 +39,8 @@ from transformers.utils import TransformersKwargs, auto_docstring, can_return_tu
 from transformers.utils.generic import OutputRecorder, check_model_inputs
 from .configuration_qwen3_moe import Qwen3MoeConfig
 
+from torchtitan.models.moe import MoE, MoEArgs
+
 
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
@@ -284,10 +286,22 @@ class Qwen3MoeDecoderLayer(GradientCheckpointingLayer):
 
         self.self_attn = Qwen3MoeAttention(config, layer_idx)
 
+        moe_args = MoEArgs(
+            num_experts=config.num_experts,
+            num_shared_experts=0,
+            score_func="softmax",
+            route_norm=config.norm_topk_prob,
+            route_scale=1.0,
+            score_before_experts=False,
+            top_k=config.num_experts_per_tok,
+            use_grouped_mm=True,
+            load_balance_coeff=None,
+        )
+
         if (layer_idx not in config.mlp_only_layers) and (
             config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0
         ):
-            self.mlp = Qwen3MoeSparseMoeBlock(config)
+            self.mlp = MoE(moe_args, dim=config.hidden_size, hidden_dim=config.moe_intermediate_size)
         else:
             self.mlp = Qwen3MoeMLP(config, intermediate_size=config.intermediate_size)
 
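The decoder layer now builds its sparse layers from torchtitan's MoE instead of the stock Qwen3MoeSparseMoeBlock, with MoEArgs filled in from the Hugging Face config: softmax routing, top_k = num_experts_per_tok, no shared experts, grouped-mm expert execution, and no auxiliary load-balancing loss. A rough standalone smoke test of the swapped block, assuming torchtitan's MoE.forward accepts and returns a [batch, seq, hidden] tensor like the HF block it replaces:

import torch
from torchtitan.models.moe import MoE, MoEArgs

# Toy sizes for a quick shape check; the real values come from config.json.
hidden_size, moe_intermediate_size = 256, 512

moe_args = MoEArgs(
    num_experts=8,              # config.num_experts
    num_shared_experts=0,
    score_func="softmax",
    route_norm=True,            # config.norm_topk_prob
    route_scale=1.0,
    score_before_experts=False,
    top_k=2,                    # config.num_experts_per_tok
    use_grouped_mm=True,        # as in the commit; may need GPU/bf16 support
    load_balance_coeff=None,
)
mlp = MoE(moe_args, dim=hidden_size, hidden_dim=moe_intermediate_size)

x = torch.randn(2, 16, hidden_size)
y = mlp(x)                      # assumed to preserve the input shape
assert y.shape == x.shape       # weights still need torchtitan init or a loaded state dict for real use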
|